You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2021/07/27 00:13:09 UTC
[spark] branch master updated: [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex
This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 55971b7 [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex
55971b7 is described below
commit 55971b70fe3c899d4516e4955bc7c9ebd4b4af70
Author: Xinrong Meng <xi...@databricks.com>
AuthorDate: Mon Jul 26 17:12:33 2021 -0700
[SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex
### What changes were proposed in this pull request?
Add set_categories to CategoricalAccessor and CategoricalIndex.
### Why are the changes needed?
set_categories is supported in pandas CategoricalAccessor and CategoricalIndex. We ought to follow pandas.
### Does this PR introduce _any_ user-facing change?
Yes, users will be able to use `set_categories`.
### How was this patch tested?
Unit tests.
Closes #33506 from xinrong-databricks/set_categories.
Authored-by: Xinrong Meng <xi...@databricks.com>
Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/categorical.py | 148 ++++++++++++++++++++-
python/pyspark/pandas/indexes/category.py | 81 +++++++++++
python/pyspark/pandas/missing/indexes.py | 1 -
.../pyspark/pandas/tests/indexes/test_category.py | 49 +++++++
python/pyspark/pandas/tests/test_categorical.py | 63 +++++++++
7 files changed, 339 insertions(+), 5 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index ebf332e..cf898aa 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -181,6 +181,7 @@ Categorical components
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
CategoricalIndex.rename_categories
+ CategoricalIndex.set_categories
.. _api.multiindex:
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 95e102f..717c762 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -406,6 +406,7 @@ the ``Series.cat`` accessor.
Series.cat.as_ordered
Series.cat.as_unordered
Series.cat.rename_categories
+ Series.cat.set_categories
.. _api.series.plot:
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index ce9b3ed..cae9ab1 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -20,6 +20,8 @@ import pandas as pd
from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like
from pyspark.pandas.internal import InternalField
+from pyspark.pandas.spark import functions as SF
+from pyspark.sql import functions as F
from pyspark.sql.types import StructField
if TYPE_CHECKING:
@@ -680,12 +682,150 @@ class CategoricalAccessor(object):
def set_categories(
self,
- new_categories: pd.Index,
- ordered: bool = None,
+ new_categories: Union[pd.Index, List],
+ ordered: Optional[bool] = None,
rename: bool = False,
inplace: bool = False,
- ) -> "ps.Series":
- raise NotImplementedError()
+ ) -> Optional["ps.Series"]:
+ """
+ Set the categories to the specified new_categories.
+
+ `new_categories` can include new categories (which will result in
+ unused categories) or remove old categories (which results in values
+ set to NaN). If `rename==True`, the categories will simply be renamed
+ (less or more items than in old categories will result in values set to
+ NaN or in unused categories respectively).
+
+ This method can be used to perform more than one action of adding,
+ removing, and reordering simultaneously and is therefore faster than
+ performing the individual steps via the more specialised methods.
+
+ On the other hand, this method does not do checks (e.g., whether the
+ old categories are included in the new categories on a reorder), which
+ can result in surprising changes, for example when using special string
+ dtypes, which do not consider an S1 string equal to a single-char
+ Python string.
+
+ Parameters
+ ----------
+ new_categories : Index-like
+ The categories in new order.
+ ordered : bool, optional
+ Whether or not the categorical is treated as an ordered categorical.
+ If not given, do not change the ordered information.
+ rename : bool, default False
+ Whether or not the new_categories should be considered as a rename
+ of the old categories or as reordered categories.
+ inplace : bool, default False
+ Whether or not to reorder the categories in-place or return a copy
+ of this categorical with reordered categories.
+
+ Returns
+ -------
+ Series with reordered categories or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If new_categories does not validate as categories
+
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+
+ Examples
+ --------
+ >>> s = ps.Series(list("abbccc"), dtype="category")
+ >>> s # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (3, object): ['a', 'b', 'c']
+
+ >>> s.cat.set_categories(['b', 'c']) # doctest: +SKIP
+ 0 NaN
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (2, object): ['b', 'c']
+
+ >>> s.cat.set_categories([1, 2, 3], rename=True) # doctest: +SKIP
+ 0 1
+ 1 2
+ 2 2
+ 3 3
+ 4 3
+ 5 3
+ dtype: category
+ Categories (3, int64): [1, 2, 3]
+
+ >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True) # doctest: +SKIP
+ 0 1
+ 1 2
+ 2 2
+ 3 3
+ 4 3
+ 5 3
+ dtype: category
+ Categories (3, int64): [1 < 2 < 3]
+ """
+ from pyspark.pandas.frame import DataFrame
+
+ if not is_list_like(new_categories):
+ raise TypeError(
+ "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
+ )
+
+ if ordered is None:
+ ordered = self.ordered
+
+ new_dtype = CategoricalDtype(new_categories, ordered=ordered)
+ scol = self._data.spark.column
+
+ if rename:
+ new_scol = (
+ F.when(scol >= len(new_categories), SF.lit(-1).cast(self._data.spark.data_type))
+ .otherwise(scol)
+ .alias(self._data._internal.data_spark_column_names[0])
+ )
+
+ internal = self._data._psdf._internal.with_new_spark_column(
+ self._data._column_label,
+ new_scol,
+ field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
+ )
+
+ if inplace:
+ self._data._psdf._update_internal_frame(internal)
+ return None
+ else:
+ psser = DataFrame(internal)._psser_for(self._data._column_label)
+ return psser._with_new_scol(
+ psser.spark.column, field=psser._internal.data_fields[0]
+ )
+ else:
+ psser = self._data.astype(new_dtype)
+ if inplace:
+ internal = self._data._psdf._internal.with_new_spark_column(
+ self._data._column_label,
+ psser.spark.column,
+ field=psser._internal.data_fields[0],
+ )
+ self._data._psdf._update_internal_frame(internal)
+ return None
+ else:
+ return psser
def _test() -> None:
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 51b14bc..cd95e39 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -517,6 +517,87 @@ class CategoricalIndex(Index):
self.name
)
+ def set_categories(
+ self,
+ new_categories: Union[pd.Index, List],
+ ordered: Optional[bool] = None,
+ rename: bool = False,
+ inplace: bool = False,
+ ) -> Optional["CategoricalIndex"]:
+ """
+ Set the categories to the specified new_categories.
+
+ `new_categories` can include new categories (which will result in
+ unused categories) or remove old categories (which results in values
+ set to NaN). If `rename==True`, the categories will simply be renamed
+ (less or more items than in old categories will result in values set to
+ NaN or in unused categories respectively).
+
+ This method can be used to perform more than one action of adding,
+ removing, and reordering simultaneously and is therefore faster than
+ performing the individual steps via the more specialised methods.
+
+ On the other hand, this method does not do checks (e.g., whether the
+ old categories are included in the new categories on a reorder), which
+ can result in surprising changes, for example when using special string
+ dtypes, which do not consider an S1 string equal to a single-char
+ Python string.
+
+ Parameters
+ ----------
+ new_categories : Index-like
+ The categories in new order.
+ ordered : bool, optional
+ Whether or not the categorical is treated as an ordered categorical.
+ If not given, do not change the ordered information.
+ rename : bool, default False
+ Whether or not the new_categories should be considered as a rename
+ of the old categories or as reordered categories.
+ inplace : bool, default False
+ Whether or not to reorder the categories in-place or return a copy
+ of this categorical with reordered categories.
+
+ Returns
+ -------
+ CategoricalIndex with reordered categories or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If new_categories does not validate as categories
+
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(list("abbccc"))
+ >>> idx # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+ >>> idx.set_categories(['b', 'c']) # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex([nan, 'b', 'b', 'c', 'c', 'c'],
+ categories=['b', 'c'], ordered=False, dtype='category')
+
+ >>> idx.set_categories([1, 2, 3], rename=True)
+ CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=False, dtype='category')
+
+ >>> idx.set_categories([1, 2, 3], rename=True, ordered=True)
+ CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=True, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return CategoricalIndex(
+ self.to_series().cat.set_categories(new_categories, ordered=ordered, rename=rename)
+ ).rename(self.name)
+
def _test() -> None:
import os
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 0f1c316..938aea2 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -123,7 +123,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
- set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
map = _unsupported_function("map", cls="CategoricalIndex")
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index a11f36a..8368839 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -311,6 +311,55 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
lambda: psidx.rename_categories("x"),
)
+ def test_set_categories(self):
+ pidx = pd.CategoricalIndex(["a", "b", "c", "d"])
+ psidx = ps.from_pandas(pidx)
+
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b", "o"]),
+ psidx.set_categories(["a", "c", "b", "o"]),
+ )
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b"]),
+ psidx.set_categories(["a", "c", "b"]),
+ )
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b", "d", "e"]),
+ psidx.set_categories(["a", "c", "b", "d", "e"]),
+ )
+
+ self.assert_eq(
+ pidx.set_categories([0, 1, 3, 2], rename=True),
+ psidx.set_categories([0, 1, 3, 2], rename=True),
+ )
+ self.assert_eq(
+ pidx.set_categories([0, 1, 3], rename=True),
+ psidx.set_categories([0, 1, 3], rename=True),
+ )
+ self.assert_eq(
+ pidx.set_categories([0, 1, 3, 2, 4], rename=True),
+ psidx.set_categories([0, 1, 3, 2, 4], rename=True),
+ )
+
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b", "o"], ordered=True),
+ psidx.set_categories(["a", "c", "b", "o"], ordered=True),
+ )
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b"], ordered=True),
+ psidx.set_categories(["a", "c", "b"], ordered=True),
+ )
+ self.assert_eq(
+ pidx.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+ psidx.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+ )
+
+ self.assertRaisesRegex(
+ ValueError,
+ "cannot use inplace with CategoricalIndex",
+ lambda: psidx.set_categories(["a", "c", "b", "o"], inplace=True),
+ )
+
if __name__ == "__main__":
import unittest
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index 4122efa..67cdf3c 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -668,6 +668,69 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
lambda: psser.cat.rename_categories("x"),
)
+ def test_set_categories(self):
+ pdf, psdf = self.df_pair
+
+ pser = pdf.b
+ psser = psdf.b
+
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b", "o"]),
+ psser.cat.set_categories(["a", "c", "b", "o"]),
+ )
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b"]),
+ psser.cat.set_categories(["a", "c", "b"]),
+ )
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b", "d", "e"]),
+ psser.cat.set_categories(["a", "c", "b", "d", "e"]),
+ )
+
+ self.assert_eq(
+ pser.cat.set_categories([0, 1, 3, 2], rename=True),
+ psser.cat.set_categories([0, 1, 3, 2], rename=True),
+ )
+ self.assert_eq(
+ pser.cat.set_categories([0, 1, 3], rename=True),
+ psser.cat.set_categories([0, 1, 3], rename=True),
+ )
+ self.assert_eq(
+ pser.cat.set_categories([0, 1, 3, 2, 4], rename=True),
+ psser.cat.set_categories([0, 1, 3, 2, 4], rename=True),
+ )
+
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b", "o"], ordered=True),
+ psser.cat.set_categories(["a", "c", "b", "o"], ordered=True),
+ )
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b"], ordered=True),
+ psser.cat.set_categories(["a", "c", "b"], ordered=True),
+ )
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+ psser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+ )
+
+ self.assert_eq(
+ pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
+ psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
+ )
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
+ psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ self.assertRaisesRegex(
+ TypeError,
+ "Parameter 'new_categories' must be list-like, was",
+ lambda: psser.cat.set_categories(None),
+ )
+
if __name__ == "__main__":
import unittest
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org