You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2021/07/27 00:13:09 UTC
[spark] branch master updated: [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 55971b7  [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex
55971b7 is described below

commit 55971b70fe3c899d4516e4955bc7c9ebd4b4af70
Author: Xinrong Meng <xi...@databricks.com>
AuthorDate: Mon Jul 26 17:12:33 2021 -0700

    [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex
    
    ### What changes were proposed in this pull request?
    Add set_categories to CategoricalAccessor and CategoricalIndex.
    
    ### Why are the changes needed?
    set_categories is supported in pandas CategoricalAccessor and CategoricalIndex. We ought to follow pandas.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, users will be able to use `set_categories`.
    
    ### How was this patch tested?
    Unit tests.
    
    Closes #33506 from xinrong-databricks/set_categories.
    
    Authored-by: Xinrong Meng <xi...@databricks.com>
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
 .../source/reference/pyspark.pandas/indexing.rst   |   1 +
 .../source/reference/pyspark.pandas/series.rst     |   1 +
 python/pyspark/pandas/categorical.py               | 148 ++++++++++++++++++++-
 python/pyspark/pandas/indexes/category.py          |  81 +++++++++++
 python/pyspark/pandas/missing/indexes.py           |   1 -
 .../pyspark/pandas/tests/indexes/test_category.py  |  49 +++++++
 python/pyspark/pandas/tests/test_categorical.py    |  63 +++++++++
 7 files changed, 339 insertions(+), 5 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index ebf332e..cf898aa 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -181,6 +181,7 @@ Categorical components
    CategoricalIndex.as_ordered
    CategoricalIndex.as_unordered
    CategoricalIndex.rename_categories
+   CategoricalIndex.set_categories
 
 .. _api.multiindex:
 
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 95e102f..717c762 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -406,6 +406,7 @@ the ``Series.cat`` accessor.
    Series.cat.as_ordered
    Series.cat.as_unordered
    Series.cat.rename_categories
+   Series.cat.set_categories
 
 .. _api.series.plot:
 
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index ce9b3ed..cae9ab1 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -20,6 +20,8 @@ import pandas as pd
 from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like
 
 from pyspark.pandas.internal import InternalField
+from pyspark.pandas.spark import functions as SF
+from pyspark.sql import functions as F
 from pyspark.sql.types import StructField
 
 if TYPE_CHECKING:
@@ -680,12 +682,150 @@ class CategoricalAccessor(object):
 
     def set_categories(
         self,
-        new_categories: pd.Index,
-        ordered: bool = None,
+        new_categories: Union[pd.Index, List],
+        ordered: Optional[bool] = None,
         rename: bool = False,
         inplace: bool = False,
-    ) -> "ps.Series":
-        raise NotImplementedError()
+    ) -> Optional["ps.Series"]:
+        """
+        Set the categories to the specified new_categories.
+
+        `new_categories` can include new categories (which will result in
+        unused categories) or remove old categories (which results in values
+        set to NaN). If `rename==True`, the categories will simply be renamed
+        (less or more items than in old categories will result in values set to
+        NaN or in unused categories respectively).
+
+        This method can be used to perform more than one action of adding,
+        removing, and reordering simultaneously and is therefore faster than
+        performing the individual steps via the more specialised methods.
+
+        On the other hand this method does not do checks (e.g., whether the
+        old categories are included in the new categories on a reorder), which
+        can result in surprising changes, for example when using special string
+        dtypes, which do not consider an S1 string equal to a single char
+        python string.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, default None
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
+        rename : bool, default False
+           Whether or not the new_categories should be considered as a rename
+           of the old categories or as reordered categories.
+        inplace : bool, default False
+           Whether or not to reorder the categories in-place or return a copy
+           of this categorical with reordered categories.
+
+        Returns
+        -------
+        Series with reordered categories or None if inplace.
+
+        Raises
+        ------
+        ValueError
+            If new_categories does not validate as categories
+
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+
+        Examples
+        --------
+        >>> s = ps.Series(list("abbccc"), dtype="category")
+        >>> s  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+
+        >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
+        0    NaN
+        1      b
+        2      b
+        3      c
+        4      c
+        5      c
+        dtype: category
+        Categories (2, object): ['b', 'c']
+
+        >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
+        0    1
+        1    2
+        2    2
+        3    3
+        4    3
+        5    3
+        dtype: category
+        Categories (3, int64): [1, 2, 3]
+
+        >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
+        0    1
+        1    2
+        2    2
+        3    3
+        4    3
+        5    3
+        dtype: category
+        Categories (3, int64): [1 < 2 < 3]
+        """
+        from pyspark.pandas.frame import DataFrame
+
+        if not is_list_like(new_categories):
+            raise TypeError(
+                "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
+            )
+
+        if ordered is None:
+            ordered = self.ordered
+
+        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
+        scol = self._data.spark.column
+
+        if rename:
+            new_scol = (
+                F.when(scol >= len(new_categories), SF.lit(-1).cast(self._data.spark.data_type))
+                .otherwise(scol)
+                .alias(self._data._internal.data_spark_column_names[0])
+            )
+
+            internal = self._data._psdf._internal.with_new_spark_column(
+                self._data._column_label,
+                new_scol,
+                field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
+            )
+
+            if inplace:
+                self._data._psdf._update_internal_frame(internal)
+                return None
+            else:
+                psser = DataFrame(internal)._psser_for(self._data._column_label)
+                return psser._with_new_scol(
+                    psser.spark.column, field=psser._internal.data_fields[0]
+                )
+        else:
+            psser = self._data.astype(new_dtype)
+            if inplace:
+                internal = self._data._psdf._internal.with_new_spark_column(
+                    self._data._column_label,
+                    psser.spark.column,
+                    field=psser._internal.data_fields[0],
+                )
+                self._data._psdf._update_internal_frame(internal)
+                return None
+            else:
+                return psser
 
 
 def _test() -> None:
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 51b14bc..cd95e39 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -517,6 +517,87 @@ class CategoricalIndex(Index):
             self.name
         )
 
+    def set_categories(
+        self,
+        new_categories: Union[pd.Index, List],
+        ordered: Optional[bool] = None,
+        rename: bool = False,
+        inplace: bool = False,
+    ) -> Optional["CategoricalIndex"]:
+        """
+        Set the categories to the specified new_categories.
+
+        `new_categories` can include new categories (which will result in
+        unused categories) or remove old categories (which results in values
+        set to NaN). If `rename==True`, the categories will simply be renamed
+        (less or more items than in old categories will result in values set to
+        NaN or in unused categories respectively).
+
+        This method can be used to perform more than one action of adding,
+        removing, and reordering simultaneously and is therefore faster than
+        performing the individual steps via the more specialised methods.
+
+        On the other hand this method does not do checks (e.g., whether the
+        old categories are included in the new categories on a reorder), which
+        can result in surprising changes, for example when using special string
+        dtypes, which do not consider an S1 string equal to a single char
+        python string.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, default None
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
+        rename : bool, default False
+           Whether or not the new_categories should be considered as a rename
+           of the old categories or as reordered categories.
+        inplace : bool, default False
+           Whether or not to reorder the categories in-place or return a copy
+           of this categorical with reordered categories.
+
+        Returns
+        -------
+        CategoricalIndex with reordered categories or None if inplace.
+
+        Raises
+        ------
+        ValueError
+            If new_categories does not validate as categories
+
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc"))
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+        >>> idx.set_categories(['b', 'c'])  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex([nan, 'b', 'b', 'c', 'c', 'c'],
+                         categories=['b', 'c'], ordered=False, dtype='category')
+
+        >>> idx.set_categories([1, 2, 3], rename=True)
+        CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=False, dtype='category')
+
+        >>> idx.set_categories([1, 2, 3], rename=True, ordered=True)
+        CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=True, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(
+            self.to_series().cat.set_categories(new_categories, ordered=ordered, rename=rename)
+        ).rename(self.name)
+
 
 def _test() -> None:
     import os
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 0f1c316..938aea2 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -123,7 +123,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
 class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
 
     # Functions
-    set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
     map = _unsupported_function("map", cls="CategoricalIndex")
 
 
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index a11f36a..8368839 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -311,6 +311,55 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
             lambda: psidx.rename_categories("x"),
         )
 
+    def test_set_categories(self):
+        pidx = pd.CategoricalIndex(["a", "b", "c", "d"])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b", "o"]),
+            psidx.set_categories(["a", "c", "b", "o"]),
+        )
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b"]),
+            psidx.set_categories(["a", "c", "b"]),
+        )
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b", "d", "e"]),
+            psidx.set_categories(["a", "c", "b", "d", "e"]),
+        )
+
+        self.assert_eq(
+            pidx.set_categories([0, 1, 3, 2], rename=True),
+            psidx.set_categories([0, 1, 3, 2], rename=True),
+        )
+        self.assert_eq(
+            pidx.set_categories([0, 1, 3], rename=True),
+            psidx.set_categories([0, 1, 3], rename=True),
+        )
+        self.assert_eq(
+            pidx.set_categories([0, 1, 3, 2, 4], rename=True),
+            psidx.set_categories([0, 1, 3, 2, 4], rename=True),
+        )
+
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b", "o"], ordered=True),
+            psidx.set_categories(["a", "c", "b", "o"], ordered=True),
+        )
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b"], ordered=True),
+            psidx.set_categories(["a", "c", "b"], ordered=True),
+        )
+        self.assert_eq(
+            pidx.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+            psidx.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+        )
+
+        self.assertRaisesRegex(
+            ValueError,
+            "cannot use inplace with CategoricalIndex",
+            lambda: psidx.set_categories(["a", "c", "b", "o"], inplace=True),
+        )
+
 
 if __name__ == "__main__":
     import unittest
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index 4122efa..67cdf3c 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -668,6 +668,69 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
             lambda: psser.cat.rename_categories("x"),
         )
 
+    def test_set_categories(self):
+        pdf, psdf = self.df_pair
+
+        pser = pdf.b
+        psser = psdf.b
+
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b", "o"]),
+            psser.cat.set_categories(["a", "c", "b", "o"]),
+        )
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b"]),
+            psser.cat.set_categories(["a", "c", "b"]),
+        )
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b", "d", "e"]),
+            psser.cat.set_categories(["a", "c", "b", "d", "e"]),
+        )
+
+        self.assert_eq(
+            pser.cat.set_categories([0, 1, 3, 2], rename=True),
+            psser.cat.set_categories([0, 1, 3, 2], rename=True),
+        )
+        self.assert_eq(
+            pser.cat.set_categories([0, 1, 3], rename=True),
+            psser.cat.set_categories([0, 1, 3], rename=True),
+        )
+        self.assert_eq(
+            pser.cat.set_categories([0, 1, 3, 2, 4], rename=True),
+            psser.cat.set_categories([0, 1, 3, 2, 4], rename=True),
+        )
+
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b", "o"], ordered=True),
+            psser.cat.set_categories(["a", "c", "b", "o"], ordered=True),
+        )
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b"], ordered=True),
+            psser.cat.set_categories(["a", "c", "b"], ordered=True),
+        )
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+            psser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True),
+        )
+
+        self.assert_eq(
+            pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
+            psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
+        )
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
+        pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
+        psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
+        self.assertRaisesRegex(
+            TypeError,
+            "Parameter 'new_categories' must be list-like, was",
+            lambda: psser.cat.set_categories(None),
+        )
+
 
 if __name__ == "__main__":
     import unittest

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org