You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2021/07/14 21:01:55 UTC

[spark] branch master updated: [SPARK-36125][PYTHON] Implement non-equality comparison operators between two Categoricals

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 0cb120f  [SPARK-36125][PYTHON] Implement non-equality comparison operators between two Categoricals
0cb120f is described below

commit 0cb120f390cac96c09ae99c8fbaec2ac06cd2848
Author: Xinrong Meng <xi...@databricks.com>
AuthorDate: Wed Jul 14 14:01:10 2021 -0700

    [SPARK-36125][PYTHON] Implement non-equality comparison operators between two Categoricals
    
    ### What changes were proposed in this pull request?
    Implement the non-equality comparison operators (`<`, `<=`, `>`, `>=`) between two Categoricals.
    Non-goal: supporting Scalar input will be a follow-up task.
    
    ### Why are the changes needed?
    pandas supports non-equality comparisons between two Categoricals. We should follow that.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. The `<`, `<=`, `>`, and `>=` operators between two Categoricals no longer raise `NotImplementedError`. An example is shown below:
    
    From:
    ```py
    >>> import pyspark.pandas as ps
    >>> from pandas.api.types import CategoricalDtype
    >>> psser = ps.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
    >>> other_psser = ps.Series([2, 1, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
    >>> with ps.option_context("compute.ops_on_diff_frames", True):
    ...     psser <= other_psser
    ...
    Traceback (most recent call last):
    ...
    NotImplementedError: <= can not be applied to categoricals.
    ```
    
    To:
    ```py
    >>> import pyspark.pandas as ps
    >>> from pandas.api.types import CategoricalDtype
    >>> psser = ps.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
    >>> other_psser = ps.Series([2, 1, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
    >>> with ps.option_context("compute.ops_on_diff_frames", True):
    ...     psser <= other_psser
    ...
    0    False
    1     True
    2     True
    dtype: bool
    ```
    ### How was this patch tested?
    Unit tests.
    
    Closes #33331 from xinrong-databricks/categorical_compare.
    
    Authored-by: Xinrong Meng <xi...@databricks.com>
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
 .../pandas/data_type_ops/categorical_ops.py        |  27 +++-
 .../tests/data_type_ops/test_categorical_ops.py    | 149 ++++++++++++++++++++-
 2 files changed, 166 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py
index 9238e6b..fb5666d 100644
--- a/python/pyspark/pandas/data_type_ops/categorical_ops.py
+++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py
@@ -22,10 +22,12 @@ import pandas as pd
 from pandas.api.types import CategoricalDtype
 
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
+from pyspark.pandas.base import column_op, IndexOpsMixin
 from pyspark.pandas.data_type_ops.base import DataTypeOps
 from pyspark.pandas.spark import functions as SF
 from pyspark.pandas.typedef import pandas_on_spark_type
 from pyspark.sql import functions as F
+from pyspark.sql.column import Column
 
 
 class CategoricalOps(DataTypeOps):
@@ -64,15 +66,28 @@ class CategoricalOps(DataTypeOps):
             scol = map_scol.getItem(index_ops.spark.column)
         return index_ops._with_new_scol(scol).astype(dtype)
 
-    # TODO(SPARK-35997): Implement comparison operators below
     def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        raise NotImplementedError("< can not be applied to %s." % self.pretty_name)
+        _non_equality_comparison_input_check(left, right)
+        return column_op(Column.__lt__)(left, right)
 
     def le(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        raise NotImplementedError("<= can not be applied to %s." % self.pretty_name)
+        _non_equality_comparison_input_check(left, right)
+        return column_op(Column.__le__)(left, right)
+
+    def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        _non_equality_comparison_input_check(left, right)
+        return column_op(Column.__gt__)(left, right)
 
     def ge(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        raise NotImplementedError("> can not be applied to %s." % self.pretty_name)
+        _non_equality_comparison_input_check(left, right)
+        return column_op(Column.__ge__)(left, right)
 
-    def gt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        raise NotImplementedError(">= can not be applied to %s." % self.pretty_name)
+
+def _non_equality_comparison_input_check(left: IndexOpsLike, right: Any) -> None:
+    if not left.dtype.ordered:
+        raise TypeError("Unordered Categoricals can only compare equality or not.")
+    if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, CategoricalDtype):
+        if hash(left.dtype) != hash(right.dtype):
+            raise TypeError("Categoricals can only be compared if 'categories' are the same.")
+    else:
+        raise TypeError("Cannot compare a Categorical with the given type.")
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
index c0fb240..840722c 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
@@ -44,6 +44,26 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def other_psser(self):
         return ps.from_pandas(self.other_pser)
 
+    @property
+    def ordered_pser(self):
+        return pd.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
+
+    @property
+    def ordered_psser(self):
+        return ps.from_pandas(self.ordered_pser)
+
+    @property
+    def other_ordered_pser(self):
+        return pd.Series([2, 1, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
+
+    @property
+    def other_ordered_psser(self):
+        return ps.from_pandas(self.other_ordered_pser)
+
+    @property
+    def unordered_psser(self):
+        return ps.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1]))
+
     def test_add(self):
         self.assertRaises(TypeError, lambda: self.psser + "x")
         self.assertRaises(TypeError, lambda: self.psser + 1)
@@ -198,16 +218,137 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
             self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index())
 
     def test_lt(self):
-        self.assertRaises(NotImplementedError, lambda: self.psser < self.other_psser)
+        ordered_pser = self.ordered_pser
+        ordered_psser = self.ordered_psser
+        self.assert_eq(ordered_pser < ordered_pser, ordered_psser < ordered_psser)
+        with option_context("compute.ops_on_diff_frames", True):
+            self.assert_eq(
+                ordered_pser < self.other_ordered_pser, ordered_psser < self.other_ordered_psser
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Unordered Categoricals can only compare equality or not",
+                lambda: self.unordered_psser < ordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same",
+                lambda: ordered_psser < self.unordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Cannot compare a Categorical with the given type",
+                lambda: ordered_psser < ps.Series([1, 2, 3]),
+            )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser < [1, 2, 3],
+        )
+        self.assertRaisesRegex(
+            TypeError, "Cannot compare a Categorical with the given type", lambda: ordered_psser < 1
+        )
 
     def test_le(self):
-        self.assertRaises(NotImplementedError, lambda: self.psser <= self.other_psser)
+        ordered_pser = self.ordered_pser
+        ordered_psser = self.ordered_psser
+        self.assert_eq(ordered_pser <= ordered_pser, ordered_psser <= ordered_psser)
+
+        with option_context("compute.ops_on_diff_frames", True):
+            self.assert_eq(
+                ordered_pser <= self.other_ordered_pser, ordered_psser <= self.other_ordered_psser
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Unordered Categoricals can only compare equality or not",
+                lambda: self.unordered_psser <= ordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same",
+                lambda: ordered_psser <= self.unordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Cannot compare a Categorical with the given type",
+                lambda: ordered_psser <= ps.Series([1, 2, 3]),
+            )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser <= [1, 2, 3],
+        )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser <= 1,
+        )
 
     def test_gt(self):
-        self.assertRaises(NotImplementedError, lambda: self.psser > self.other_psser)
+        ordered_pser = self.ordered_pser
+        ordered_psser = self.ordered_psser
+        self.assert_eq(ordered_pser > ordered_pser, ordered_psser > ordered_psser)
+        with option_context("compute.ops_on_diff_frames", True):
+            self.assert_eq(
+                ordered_pser > self.other_ordered_pser, ordered_psser > self.other_ordered_psser
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Unordered Categoricals can only compare equality or not",
+                lambda: self.unordered_psser > ordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same",
+                lambda: ordered_psser > self.unordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Cannot compare a Categorical with the given type",
+                lambda: ordered_psser > ps.Series([1, 2, 3]),
+            )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser > [1, 2, 3],
+        )
+        self.assertRaisesRegex(
+            TypeError, "Cannot compare a Categorical with the given type", lambda: ordered_psser > 1
+        )
 
     def test_ge(self):
-        self.assertRaises(NotImplementedError, lambda: self.psser >= self.other_psser)
+        ordered_pser = self.ordered_pser
+        ordered_psser = self.ordered_psser
+        self.assert_eq(ordered_pser >= ordered_pser, ordered_psser >= ordered_psser)
+        with option_context("compute.ops_on_diff_frames", True):
+            self.assert_eq(
+                ordered_pser >= self.other_ordered_pser, ordered_psser >= self.other_ordered_psser
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Unordered Categoricals can only compare equality or not",
+                lambda: self.unordered_psser >= ordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same",
+                lambda: ordered_psser >= self.unordered_psser,
+            )
+            self.assertRaisesRegex(
+                TypeError,
+                "Cannot compare a Categorical with the given type",
+                lambda: ordered_psser >= ps.Series([1, 2, 3]),
+            )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser >= [1, 2, 3],
+        )
+        self.assertRaisesRegex(
+            TypeError,
+            "Cannot compare a Categorical with the given type",
+            lambda: ordered_psser >= 1,
+        )
 
 
 if __name__ == "__main__":

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org