You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2021/07/21 01:25:26 UTC

[spark] branch branch-3.2 updated: [SPARK-36186][PYTHON] Add as_ordered/as_unordered to CategoricalAccessor and CategoricalIndex

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new a3a13da  [SPARK-36186][PYTHON] Add as_ordered/as_unordered to CategoricalAccessor and CategoricalIndex
a3a13da is described below

commit a3a13da26c19b4241bf2f76273a82fe8598eddf5
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Tue Jul 20 18:23:54 2021 -0700

    [SPARK-36186][PYTHON] Add as_ordered/as_unordered to CategoricalAccessor and CategoricalIndex
    
    ### What changes were proposed in this pull request?
    
    Add `as_ordered`/`as_unordered` to `CategoricalAccessor` and `CategoricalIndex`.
    
    ### Why are the changes needed?
    
    `as_ordered`/`as_unordered` are not yet implemented in `CategoricalAccessor` and `CategoricalIndex`, so we should implement them.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, users will be able to use `as_ordered`/`as_unordered`.
    
    ### How was this patch tested?
    
    Added some tests.
    
    Closes #33400 from ueshin/issues/SPARK-36186/as_ordered_unordered.
    
    Authored-by: Takuya UESHIN <ue...@databricks.com>
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
    (cherry picked from commit 376fadc89cffac97aebe49a7cf4a4bc978b1d09e)
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
 .../source/reference/pyspark.pandas/indexing.rst   |   2 +
 .../source/reference/pyspark.pandas/series.rst     |   2 +
 python/pyspark/pandas/categorical.py               | 116 +++++++++++++++++++--
 python/pyspark/pandas/indexes/category.py          |  72 ++++++++++++-
 python/pyspark/pandas/missing/indexes.py           |   2 -
 .../pyspark/pandas/tests/indexes/test_category.py  |  10 ++
 python/pyspark/pandas/tests/test_categorical.py    |  22 ++++
 7 files changed, 214 insertions(+), 12 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index e91f699..4f84d91 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -175,6 +175,8 @@ Categorical components
    CategoricalIndex.codes
    CategoricalIndex.categories
    CategoricalIndex.ordered
+   CategoricalIndex.as_ordered
+   CategoricalIndex.as_unordered
 
 .. _api.multiindex:
 
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index a199d70..b718d79 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -401,6 +401,8 @@ the ``Series.cat`` accessor.
    Series.cat.categories
    Series.cat.ordered
    Series.cat.codes
+   Series.cat.as_ordered
+   Series.cat.as_unordered
 
 .. _api.series.plot:
 
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index 3495b35..b8cc88c 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import TYPE_CHECKING, cast
+from typing import Optional, TYPE_CHECKING, cast
 
 import pandas as pd
 from pandas.api.types import CategoricalDtype
@@ -62,6 +62,10 @@ class CategoricalAccessor(object):
         self._data = series
 
     @property
+    def _dtype(self) -> CategoricalDtype:
+        return cast(CategoricalDtype, self._data.dtype)
+
+    @property
     def categories(self) -> pd.Index:
         """
         The categories of this categorical.
@@ -82,7 +86,7 @@ class CategoricalAccessor(object):
         >>> s.cat.categories
         Index(['a', 'b', 'c'], dtype='object')
         """
-        return cast(CategoricalDtype, self._data.dtype).categories
+        return self._dtype.categories
 
     @categories.setter
     def categories(self, categories: pd.Index) -> None:
@@ -109,7 +113,7 @@ class CategoricalAccessor(object):
         >>> s.cat.ordered
         False
         """
-        return cast(CategoricalDtype, self._data.dtype).ordered
+        return self._dtype.ordered
 
     @property
     def codes(self) -> "ps.Series":
@@ -152,11 +156,109 @@ class CategoricalAccessor(object):
     def add_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series":
         raise NotImplementedError()
 
-    def as_ordered(self, inplace: bool = False) -> "ps.Series":
-        raise NotImplementedError()
+    def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
+        from pyspark.pandas.frame import DataFrame
+
+        if self.ordered == ordered:
+            if inplace:
+                return None
+            else:
+                psser = self._data
+        else:
+            internal = self._data._psdf._internal.with_new_spark_column(
+                self._data._column_label,
+                self._data.spark.column,
+                field=self._data._internal.data_fields[0].copy(
+                    dtype=CategoricalDtype(categories=self.categories, ordered=ordered)
+                ),
+            )
+            if inplace:
+                self._data._psdf._update_internal_frame(internal)
+                return None
+            else:
+                psser = DataFrame(internal)._psser_for(self._data._column_label)
+
+        return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+
+    def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
+        """
+        Set the Categorical to be ordered.
 
-    def as_unordered(self, inplace: bool = False) -> "ps.Series":
-        raise NotImplementedError()
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to set the ordered attribute in-place or return
+           a copy of this categorical with ordered set to True.
+
+        Returns
+        -------
+        Series or None
+            Ordered Categorical or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> s = ps.Series(list("abbccc"), dtype="category")
+        >>> s  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+
+        >>> s.cat.as_ordered()  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a' < 'b' < 'c']
+        """
+        return self._set_ordered(ordered=True, inplace=inplace)
+
+    def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
+        """
+        Set the Categorical to be unordered.
+
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to set the ordered attribute in-place or return
+           a copy of this categorical with ordered set to False.
+
+        Returns
+        -------
+        Series or None
+            Unordered Categorical or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> s = ps.Series(list("abbccc"), dtype="category").cat.as_ordered()
+        >>> s  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a' < 'b' < 'c']
+
+        >>> s.cat.as_unordered()  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+        """
+        return self._set_ordered(ordered=False, inplace=inplace)
 
     def remove_categories(self, removals: pd.Index, inplace: bool = False) -> "ps.Series":
         raise NotImplementedError()
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 2f8ad17..a7ad2a0 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 from functools import partial
-from typing import Any, no_type_check, cast
+from typing import Any, Optional, cast, no_type_check
 
 import pandas as pd
 from pandas.api.types import is_hashable, CategoricalDtype
@@ -116,6 +116,10 @@ class CategoricalIndex(Index):
         )
 
     @property
+    def dtype(self) -> CategoricalDtype:
+        return cast(CategoricalDtype, super().dtype)
+
+    @property
     def codes(self) -> Index:
         """
         The category codes of this categorical.
@@ -167,7 +171,7 @@ class CategoricalIndex(Index):
         >>> idx.categories
         Index(['a', 'b', 'c'], dtype='object')
         """
-        return cast(CategoricalDtype, self.dtype).categories
+        return self.dtype.categories
 
     @categories.setter
     def categories(self, categories: pd.Index) -> None:
@@ -188,7 +192,69 @@ class CategoricalIndex(Index):
         >>> idx.ordered
         False
         """
-        return cast(CategoricalDtype, self.dtype).ordered
+        return self.dtype.ordered
+
+    def as_ordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
+        """
+        Set the Categorical to be ordered.
+
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to set the ordered attribute in-place or return
+           a copy of this categorical with ordered set to True.
+
+        Returns
+        -------
+        CategoricalIndex or None
+            Ordered Categorical or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc"))
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+        >>> idx.as_ordered()  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=True, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(self.to_series().cat.as_ordered()).rename(self.name)
+
+    def as_unordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
+        """
+        Set the Categorical to be unordered.
+
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to set the ordered attribute in-place or return
+           a copy of this categorical with ordered set to False.
+
+        Returns
+        -------
+        CategoricalIndex or None
+            Unordered Categorical or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc")).as_ordered()
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=True, dtype='category')
+
+        >>> idx.as_unordered()  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(self.to_series().cat.as_unordered()).rename(self.name)
 
     def __getattr__(self, item: str) -> Any:
         if hasattr(MissingPandasLikeCategoricalIndex, item):
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 0b8a975..6ca564f 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -131,8 +131,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
         "remove_unused_categories", cls="CategoricalIndex"
     )
     set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
-    as_ordered = _unsupported_function("as_ordered", cls="CategoricalIndex")
-    as_unordered = _unsupported_function("as_unordered", cls="CategoricalIndex")
     map = _unsupported_function("map", cls="CategoricalIndex")
 
 
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index fb72b30..02752ec 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -67,6 +67,16 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
         self.assert_eq(psidx.codes, pd.Index(pidx.codes))
         self.assert_eq(psidx.ordered, pidx.ordered)
 
+    def test_as_ordered_unordered(self):
+        pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(pidx.as_ordered(), psidx.as_ordered())
+        self.assert_eq(pidx.as_unordered(), psidx.as_unordered())
+
+        self.assertRaises(ValueError, lambda: psidx.as_ordered(inplace=True))
+        self.assertRaises(ValueError, lambda: psidx.as_unordered(inplace=True))
+
     def test_astype(self):
         pidx = pd.Index(["a", "b", "c"])
         psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index 7a1f7be..a4c9b148 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -65,6 +65,28 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         self.assert_eq(psser.cat.codes, pser.cat.codes)
         self.assert_eq(psser.cat.ordered, pser.cat.ordered)
 
+    def test_as_ordered_unordered(self):
+        pdf, psdf = self.df_pair
+
+        pser = pdf.a
+        psser = psdf.a
+
+        # as_ordered
+        self.assert_eq(pser.cat.as_ordered(), psser.cat.as_ordered())
+
+        pser.cat.as_ordered(inplace=True)
+        psser.cat.as_ordered(inplace=True)
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
+        # as_unordered
+        self.assert_eq(pser.cat.as_unordered(), psser.cat.as_unordered())
+
+        pser.cat.as_unordered(inplace=True)
+        psser.cat.as_unordered(inplace=True)
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
     def test_astype(self):
         pser = pd.Series(["a", "b", "c"])
         psser = ps.from_pandas(pser)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org