You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/12/31 09:34:37 UTC

(spark) branch master updated: [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ae9106370a9f [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`
ae9106370a9f is described below

commit ae9106370a9f3002979077662149ba9122d86cef
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Sun Dec 31 17:34:17 2023 +0800

    [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`
    
    ### What changes were proposed in this pull request?
    Move slow tests out of `IndexesTests`
    
    ### Why are the changes needed?
    To improve test parallelism: the slow tests now live in their own test modules.
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44543 from zhengruifeng/ps_test_idx_base_sort_take.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 dev/sparktestsupport/modules.py                    |   8 +
 .../tests/connect/indexes/test_parity_sort.py      |  41 +++
 .../connect/indexes/test_parity_symmetric_diff.py  |  41 +++
 .../tests/connect/indexes/test_parity_take.py      |  41 +++
 .../tests/connect/indexes/test_parity_unique.py    |  41 +++
 python/pyspark/pandas/tests/indexes/test_base.py   | 280 ---------------------
 python/pyspark/pandas/tests/indexes/test_sort.py   |  99 ++++++++
 .../pandas/tests/indexes/test_symmetric_diff.py    | 125 +++++++++
 python/pyspark/pandas/tests/indexes/test_take.py   |  84 +++++++
 python/pyspark/pandas/tests/indexes/test_unique.py | 164 ++++++++++++
 10 files changed, 644 insertions(+), 280 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 7f0c31201c84..6aca31e5efdf 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -798,6 +798,10 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.tests.indexes.test_base",
         "pyspark.pandas.tests.indexes.test_conversion",
         "pyspark.pandas.tests.indexes.test_drop",
+        "pyspark.pandas.tests.indexes.test_sort",
+        "pyspark.pandas.tests.indexes.test_symmetric_diff",
+        "pyspark.pandas.tests.indexes.test_take",
+        "pyspark.pandas.tests.indexes.test_unique",
         "pyspark.pandas.tests.indexes.test_asof",
         "pyspark.pandas.tests.indexes.test_astype",
         "pyspark.pandas.tests.indexes.test_delete",
@@ -1090,6 +1094,10 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.indexes.test_parity_base",
         "pyspark.pandas.tests.connect.indexes.test_parity_conversion",
         "pyspark.pandas.tests.connect.indexes.test_parity_drop",
+        "pyspark.pandas.tests.connect.indexes.test_parity_sort",
+        "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff",
+        "pyspark.pandas.tests.connect.indexes.test_parity_take",
+        "pyspark.pandas.tests.connect.indexes.test_parity_unique",
         "pyspark.pandas.tests.connect.indexes.test_parity_asof",
         "pyspark.pandas.tests.connect.indexes.test_parity_astype",
         "pyspark.pandas.tests.connect.indexes.test_parity_delete",
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py
new file mode 100644
index 000000000000..1affd5c3d777
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_sort import IndexesSortMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesSortParityTests(
+    IndexesSortMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.indexes.test_parity_sort import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py
new file mode 100644
index 000000000000..825167baffa8
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_symmetric_diff import IndexesSymmetricDiffMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesSymmetricDiffParityTests(
+    IndexesSymmetricDiffMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py
new file mode 100644
index 000000000000..34151a4a8fb6
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_take import IndexesTakeMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesTakeParityTests(
+    IndexesTakeMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.indexes.test_parity_take import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py
new file mode 100644
index 000000000000..d303b8f969db
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_unique import UniqueMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesUniqueParityTests(
+    UniqueMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.indexes.test_parity_unique import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 788319a38c2d..f7e6c553ac15 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -235,24 +235,6 @@ class IndexesTestsMixin:
         psidx = ps.from_pandas(pidx)
         self.assertEqual(pidx.levshape, psidx.levshape)
 
-    def test_index_unique(self):
-        psidx = self.psdf.index
-
-        # here the output is different than pandas in terms of order
-        expected = [0, 1, 3, 5, 6, 8, 9]
-
-        self.assert_eq(expected, sorted(psidx.unique()._to_pandas()))
-        self.assert_eq(expected, sorted(psidx.unique(level=0)._to_pandas()))
-
-        expected = [1, 2, 4, 6, 7, 9, 10]
-        self.assert_eq(expected, sorted((psidx + 1).unique()._to_pandas()))
-
-        with self.assertRaisesRegex(IndexError, "Too many levels*"):
-            psidx.unique(level=1)
-
-        with self.assertRaisesRegex(KeyError, "Requested level (hi)*"):
-            psidx.unique(level="hi")
-
     def test_multi_index_copy(self):
         arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
         idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
@@ -261,85 +243,6 @@ class IndexesTestsMixin:
 
         self.assert_eq(psdf.index.copy(), pdf.index.copy())
 
-    def test_index_symmetric_difference(self):
-        pidx1 = pd.Index([1, 2, 3, 4])
-        pidx2 = pd.Index([2, 3, 4, 5])
-        psidx1 = ps.from_pandas(pidx1)
-        psidx2 = ps.from_pandas(pidx2)
-
-        self.assert_eq(
-            psidx1.symmetric_difference(psidx2).sort_values(),
-            pidx1.symmetric_difference(pidx2).sort_values(),
-        )
-        self.assert_eq(
-            (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
-            (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
-        )
-        # No longer supported from pandas 2.0.0.
-        if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-            self.assert_eq(
-                (psidx1 ^ psidx2).sort_values(),
-                ps.Index([1, 5], dtype="int64"),
-            )
-        else:
-            self.assert_eq(
-                (psidx1 ^ psidx2).sort_values(),
-                (pidx1 ^ pidx2).sort_values(),
-            )
-        self.assert_eq(
-            psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
-            pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
-        )
-
-        pmidx1 = pd.MultiIndex(
-            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
-            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
-        )
-        pmidx2 = pd.MultiIndex(
-            [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
-            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
-        )
-        psmidx1 = ps.from_pandas(pmidx1)
-        psmidx2 = ps.from_pandas(pmidx2)
-
-        self.assert_eq(
-            psmidx1.symmetric_difference(psmidx2).sort_values(),
-            pmidx1.symmetric_difference(pmidx2).sort_values(),
-        )
-
-        # Pandas has a bug that raise TypeError when setting `result_name` for MultiIndex.
-        pandas_result = pmidx1.symmetric_difference(pmidx2)
-        pandas_result.names = ["a", "b"]
-        self.assert_eq(
-            psmidx1.symmetric_difference(psmidx2, result_name=["a", "b"]).sort_values(),
-            pandas_result,
-        )
-
-        # Pandas sort the result by default, so doesn't provide the `True` for sort.
-        self.assert_eq(
-            psmidx1.symmetric_difference(psmidx2, sort=True),
-            pmidx1.symmetric_difference(pmidx2),
-        )
-
-        idx = ps.Index(["a", "b", "c"])
-        midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-
-        with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
-            idx.symmetric_difference(midx)
-
-    def test_multi_index_symmetric_difference(self):
-        idx = ps.Index(["a", "b", "c"])
-        midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-        midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-
-        self.assert_eq(
-            midx.symmetric_difference(midx_),
-            midx._to_pandas().symmetric_difference(midx_._to_pandas()),
-        )
-
-        with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
-            midx.symmetric_difference(idx)
-
     def test_missing(self):
         psdf = ps.DataFrame(
             {
@@ -521,32 +424,6 @@ class IndexesTestsMixin:
             ):
                 getattr(psdf.set_index("c").index, name)
 
-    def test_index_has_duplicates(self):
-        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
-        names = [None, "ks", "ks", None]
-        has_dup = [False, True, True, False]
-
-        for idx, name, expected in zip(indexes, names, has_dup):
-            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
-            psdf = ps.from_pandas(pdf)
-
-            self.assertEqual(psdf.index.has_duplicates, expected)
-
-    def test_multiindex_has_duplicates(self):
-        indexes = [
-            [list("abc"), list("edf")],
-            [list("aac"), list("edf")],
-            [list("aac"), list("eef")],
-            [[1, 4, 4], [4, 6, 6]],
-        ]
-        has_dup = [False, False, True, True]
-
-        for idx, expected in zip(indexes, has_dup):
-            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
-            psdf = ps.from_pandas(pdf)
-
-            self.assertEqual(psdf.index.has_duplicates, expected)
-
     def test_multi_index_not_supported(self):
         psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
 
@@ -602,60 +479,6 @@ class IndexesTestsMixin:
         with self.assertRaisesRegex(TypeError, "Unsupported type list"):
             psidx.fillna([1, 2])
 
-    def _test_sort_values(self, pidx, psidx):
-        self.assert_eq(pidx.sort_values(), psidx.sort_values())
-        # Parameter ascending
-        self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False))
-        # Parameter return_indexer
-        p_sorted, p_indexer = pidx.sort_values(return_indexer=True)
-        ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True)
-        self.assert_eq(p_sorted, ps_sorted)
-        self.assert_eq(p_indexer, ps_indexer.to_list())
-        self.assert_eq(
-            pidx.sort_values(return_indexer=False), psidx.sort_values(return_indexer=False)
-        )
-        # Parameter return_indexer and ascending
-        p_sorted, p_indexer = pidx.sort_values(return_indexer=True, ascending=False)
-        ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True, ascending=False)
-        self.assert_eq(p_sorted, ps_sorted)
-        self.assert_eq(p_indexer, ps_indexer.to_list())
-        self.assert_eq(
-            pidx.sort_values(return_indexer=False, ascending=False),
-            psidx.sort_values(return_indexer=False, ascending=False),
-        )
-
-    def test_sort_values(self):
-        pidx = pd.Index([-10, -100, 200, 100])
-        psidx = ps.from_pandas(pidx)
-
-        self._test_sort_values(pidx, psidx)
-
-        pidx.name = "koalas"
-        psidx.name = "koalas"
-
-        self._test_sort_values(pidx, psidx)
-
-        pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
-        psidx = ps.from_pandas(pidx)
-
-        pidx.names = ["hello", "koalas", "goodbye"]
-        psidx.names = ["hello", "koalas", "goodbye"]
-
-        self._test_sort_values(pidx, psidx)
-
-    def test_index_sort(self):
-        idx = ps.Index([1, 2, 3, 4, 5])
-        midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
-
-        with self.assertRaisesRegex(
-            TypeError, "cannot sort an Index object in-place, use sort_values instead"
-        ):
-            idx.sort()
-        with self.assertRaisesRegex(
-            TypeError, "cannot sort an Index object in-place, use sort_values instead"
-        ):
-            midx.sort()
-
     def test_multiindex_isna(self):
         psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
 
@@ -671,18 +494,6 @@ class IndexesTestsMixin:
         with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
             psidx.notnull()
 
-    def test_index_nunique(self):
-        pidx = pd.Index([1, 1, 2, None])
-        psidx = ps.from_pandas(pidx)
-
-        self.assert_eq(pidx.nunique(), psidx.nunique())
-        self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True))
-
-    def test_multiindex_nunique(self):
-        psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
-        with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
-            psidx.notnull()
-
     def test_multiindex_rename(self):
         pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
         psidx = ps.from_pandas(pidx)
@@ -829,63 +640,6 @@ class IndexesTestsMixin:
         self.assertRaises(ValueError, lambda: psmidx.repeat(-1))
         self.assertRaises(TypeError, lambda: psmidx.repeat("abc"))
 
-    def test_unique(self):
-        pidx = pd.Index(["a", "b", "a"])
-        psidx = ps.from_pandas(pidx)
-
-        self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
-        self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
-
-        pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
-        psmidx = ps.from_pandas(pmidx)
-
-        self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
-        self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
-
-        with self.assertRaisesRegex(
-            IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
-        ):
-            psidx.unique(level=-2)
-
-    def test_take(self):
-        # Index
-        pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas")
-        psidx = ps.from_pandas(pidx)
-
-        self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values())
-        self.assert_eq(
-            psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values()
-        )
-        self.assert_eq(psidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values())
-        self.assert_eq(
-            psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values()
-        )
-
-        # MultiIndex
-        pmidx = pd.MultiIndex.from_tuples(
-            [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
-        )
-        psmidx = ps.from_pandas(pmidx)
-
-        self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values())
-        self.assert_eq(
-            psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values()
-        )
-        self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values())
-        self.assert_eq(
-            psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values()
-        )
-
-        # Checking the type of indices.
-        self.assertRaises(TypeError, lambda: psidx.take(1))
-        self.assertRaises(TypeError, lambda: psidx.take("1"))
-        self.assertRaises(TypeError, lambda: psidx.take({1, 2}))
-        self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None}))
-        self.assertRaises(TypeError, lambda: psmidx.take(1))
-        self.assertRaises(TypeError, lambda: psmidx.take("1"))
-        self.assertRaises(TypeError, lambda: psmidx.take({1, 2}))
-        self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None}))
-
     def test_index_get_level_values(self):
         pidx = pd.Index([1, 2, 3], name="ks")
         psidx = ps.from_pandas(pidx)
@@ -1056,32 +810,6 @@ class IndexesTestsMixin:
         psmidx = ps.from_pandas(pmidx)
         self.assert_eq(pmidx.inferred_type, psmidx.inferred_type)
 
-    def test_index_is_unique(self):
-        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
-        names = [None, "ks", "ks", None]
-        is_uniq = [True, False, False, True]
-
-        for idx, name, expected in zip(indexes, names, is_uniq):
-            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
-            psdf = ps.from_pandas(pdf)
-
-            self.assertEqual(psdf.index.is_unique, expected)
-
-    def test_multiindex_is_unique(self):
-        indexes = [
-            [list("abc"), list("edf")],
-            [list("aac"), list("edf")],
-            [list("aac"), list("eef")],
-            [[1, 4, 4], [4, 6, 6]],
-        ]
-        is_uniq = [True, True, False, False]
-
-        for idx, expected in zip(indexes, is_uniq):
-            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
-            psdf = ps.from_pandas(pdf)
-
-            self.assertEqual(psdf.index.is_unique, expected)
-
     def test_view(self):
         pidx = pd.Index([1, 2, 3, 4], name="Koalas")
         psidx = ps.from_pandas(pidx)
@@ -1165,14 +893,6 @@ class IndexesTestsMixin:
         psmidx2 = ps.from_pandas(pmidx2)
         self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
 
-    def test_multi_index_nunique(self):
-        tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
-        pmidx = pd.MultiIndex.from_tuples(tuples)
-        psmidx = ps.from_pandas(pmidx)
-
-        with self.assertRaisesRegex(NotImplementedError, "nunique is not defined for MultiIndex"):
-            psmidx.nunique()
-
 
 class IndexesTests(
     IndexesTestsMixin,
diff --git a/python/pyspark/pandas/tests/indexes/test_sort.py b/python/pyspark/pandas/tests/indexes/test_sort.py
new file mode 100644
index 000000000000..8e38f2cff3f3
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_sort.py
@@ -0,0 +1,99 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesSortMixin:
+    def _test_sort_values(self, pidx, psidx):
+        self.assert_eq(pidx.sort_values(), psidx.sort_values())
+        # Parameter ascending
+        self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False))
+        # Parameter return_indexer
+        p_sorted, p_indexer = pidx.sort_values(return_indexer=True)
+        ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True)
+        self.assert_eq(p_sorted, ps_sorted)
+        self.assert_eq(p_indexer, ps_indexer.to_list())
+        self.assert_eq(
+            pidx.sort_values(return_indexer=False), psidx.sort_values(return_indexer=False)
+        )
+        # Parameter return_indexer and ascending
+        p_sorted, p_indexer = pidx.sort_values(return_indexer=True, ascending=False)
+        ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True, ascending=False)
+        self.assert_eq(p_sorted, ps_sorted)
+        self.assert_eq(p_indexer, ps_indexer.to_list())
+        self.assert_eq(
+            pidx.sort_values(return_indexer=False, ascending=False),
+            psidx.sort_values(return_indexer=False, ascending=False),
+        )
+
+    def test_sort_values(self):
+        pidx = pd.Index([-10, -100, 200, 100])
+        psidx = ps.from_pandas(pidx)
+
+        self._test_sort_values(pidx, psidx)
+
+        pidx.name = "koalas"
+        psidx.name = "koalas"
+
+        self._test_sort_values(pidx, psidx)
+
+        pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
+        psidx = ps.from_pandas(pidx)
+
+        pidx.names = ["hello", "koalas", "goodbye"]
+        psidx.names = ["hello", "koalas", "goodbye"]
+
+        self._test_sort_values(pidx, psidx)
+
+    def test_index_sort(self):
+        idx = ps.Index([1, 2, 3, 4, 5])
+        midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
+
+        with self.assertRaisesRegex(
+            TypeError, "cannot sort an Index object in-place, use sort_values instead"
+        ):
+            idx.sort()
+        with self.assertRaisesRegex(
+            TypeError, "cannot sort an Index object in-place, use sort_values instead"
+        ):
+            midx.sort()
+
+
+class IndexesSortTests(
+    IndexesSortMixin,
+    ComparisonTestBase,
+    TestUtils,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.indexes.test_sort import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py
new file mode 100644
index 000000000000..d08544557954
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py
@@ -0,0 +1,125 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesSymmetricDiffMixin:
+    def test_index_symmetric_difference(self):
+        pidx1 = pd.Index([1, 2, 3, 4])
+        pidx2 = pd.Index([2, 3, 4, 5])
+        psidx1 = ps.from_pandas(pidx1)
+        psidx2 = ps.from_pandas(pidx2)
+
+        self.assert_eq(
+            psidx1.symmetric_difference(psidx2).sort_values(),
+            pidx1.symmetric_difference(pidx2).sort_values(),
+        )
+        self.assert_eq(
+            (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
+            (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
+        )
+        # No longer supported from pandas 2.0.0.
+        if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
+            self.assert_eq(
+                (psidx1 ^ psidx2).sort_values(),
+                ps.Index([1, 5], dtype="int64"),
+            )
+        else:
+            self.assert_eq(
+                (psidx1 ^ psidx2).sort_values(),
+                (pidx1 ^ pidx2).sort_values(),
+            )
+        self.assert_eq(
+            psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
+            pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
+        )
+
+        pmidx1 = pd.MultiIndex(
+            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
+            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
+        )
+        pmidx2 = pd.MultiIndex(
+            [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
+            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
+        )
+        psmidx1 = ps.from_pandas(pmidx1)
+        psmidx2 = ps.from_pandas(pmidx2)
+
+        self.assert_eq(
+            psmidx1.symmetric_difference(psmidx2).sort_values(),
+            pmidx1.symmetric_difference(pmidx2).sort_values(),
+        )
+
+        # Pandas has a bug that raise TypeError when setting `result_name` for MultiIndex.
+        pandas_result = pmidx1.symmetric_difference(pmidx2)
+        pandas_result.names = ["a", "b"]
+        self.assert_eq(
+            psmidx1.symmetric_difference(psmidx2, result_name=["a", "b"]).sort_values(),
+            pandas_result,
+        )
+
+        # Pandas sort the result by default, so doesn't provide the `True` for sort.
+        self.assert_eq(
+            psmidx1.symmetric_difference(psmidx2, sort=True),
+            pmidx1.symmetric_difference(pmidx2),
+        )
+
+        idx = ps.Index(["a", "b", "c"])
+        midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+
+        with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
+            idx.symmetric_difference(midx)
+
+    def test_multi_index_symmetric_difference(self):
+        idx = ps.Index(["a", "b", "c"])
+        midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+
+        self.assert_eq(
+            midx.symmetric_difference(midx_),
+            midx._to_pandas().symmetric_difference(midx_._to_pandas()),
+        )
+
+        with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
+            midx.symmetric_difference(idx)
+
+
+class IndexesSymmetricDiffTests(
+    IndexesSymmetricDiffMixin,
+    ComparisonTestBase,
+    TestUtils,
+):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.indexes.test_symmetric_diff import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_take.py b/python/pyspark/pandas/tests/indexes/test_take.py
new file mode 100644
index 000000000000..efd95c12a995
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_take.py
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesTakeMixin:
+    """Tests of `take` on pandas-on-Spark Index and MultiIndex, compared with pandas."""
+
+    def test_take(self):
+        # Positions are supplied as lists and as ranges, with non-negative and negative
+        # values; both sides are sorted before they are compared.
+        pd_index = pd.Index([100, 200, 300, 400, 500], name="Koalas")
+        ps_index = ps.from_pandas(pd_index)
+        for positions in ([0, 2, 4], range(0, 5, 2), [-4, -2, 0], range(-4, 1, 2)):
+            self.assert_eq(
+                ps_index.take(positions).sort_values(),
+                pd_index.take(positions).sort_values(),
+            )
+
+        # Same checks for a two-level MultiIndex.
+        pd_multi = pd.MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
+        )
+        ps_multi = ps.from_pandas(pd_multi)
+        for positions in ([0, 2], range(0, 4, 2), [-2, 0], range(-2, 1, 2)):
+            self.assert_eq(
+                ps_multi.take(positions).sort_values(),
+                pd_multi.take(positions).sort_values(),
+            )
+
+        # An int, a str, a set and a dict must each be rejected with TypeError.
+        for target in (ps_index, ps_multi):
+            for invalid in (1, "1", {1, 2}, {1: None, 2: None}):
+                with self.assertRaises(TypeError):
+                    target.take(invalid)
+
+
+class IndexesTakeTests(IndexesTakeMixin, ComparisonTestBase, TestUtils):
+    """Combines the IndexesTakeMixin tests with ComparisonTestBase and TestUtils."""
+
+
+if __name__ == "__main__":
+    # Runs only when this module is executed as a script.
+    from pyspark.pandas.tests.indexes.test_take import *  # noqa: F401
+
+    # Use the XML runner (reports written under target/test-reports) when xmlrunner
+    # can be imported; otherwise pass None as the runner to unittest.main.
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_unique.py b/python/pyspark/pandas/tests/indexes/test_unique.py
new file mode 100644
index 000000000000..f983aabcee6b
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_unique.py
@@ -0,0 +1,164 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class UniqueMixin:
+    """Tests of uniqueness-related Index APIs: unique, is_unique, nunique, has_duplicates."""
+
+    @property
+    def pdf(self):
+        # pandas fixture; the index label 9 appears three times.
+        return pd.DataFrame(
+            {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
+            index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
+        )
+
+    @property
+    def psdf(self):
+        # pandas-on-Spark counterpart of `pdf`, rebuilt on every access.
+        return ps.from_pandas(self.pdf)
+
+    def test_index_unique(self):
+        psidx = self.psdf.index
+
+        # here the output is different than pandas in terms of order
+        expected = [0, 1, 3, 5, 6, 8, 9]
+
+        self.assert_eq(expected, sorted(psidx.unique()._to_pandas()))
+        self.assert_eq(expected, sorted(psidx.unique(level=0)._to_pandas()))
+
+        expected = [1, 2, 4, 6, 7, 9, 10]
+        self.assert_eq(expected, sorted((psidx + 1).unique()._to_pandas()))
+
+        # assertRaisesRegex takes a regular expression, not a glob: a trailing `*`
+        # would only quantify the preceding character, so plain text is used.
+        with self.assertRaisesRegex(IndexError, "Too many levels"):
+            psidx.unique(level=1)
+
+        # The parentheses are escaped so that "(hi)" is matched literally instead of
+        # being parsed as a regex group.
+        with self.assertRaisesRegex(KeyError, r"Requested level \(hi\)"):
+            psidx.unique(level="hi")
+
+    def test_unique(self):
+        # Both sides are sorted before comparison. Each comparison is made once;
+        # the identical back-to-back repeats were dropped.
+        pidx = pd.Index(["a", "b", "a"])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
+
+        pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
+        psmidx = ps.from_pandas(pmidx)
+
+        self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
+
+        with self.assertRaisesRegex(
+            IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
+        ):
+            psidx.unique(level=-2)
+
+    def test_index_is_unique(self):
+        # `indexes`, `names` and `is_uniq` are parallel: one case per position.
+        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+        names = [None, "ks", "ks", None]
+        is_uniq = [True, False, False, True]
+
+        for idx, name, expected in zip(indexes, names, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+            psdf = ps.from_pandas(pdf)
+
+            self.assertEqual(psdf.index.is_unique, expected)
+
+    def test_multiindex_is_unique(self):
+        # Each entry is a pair of per-level label lists passed as `index=`;
+        # `is_uniq` holds the expected result at the same position.
+        indexes = [
+            [list("abc"), list("edf")],
+            [list("aac"), list("edf")],
+            [list("aac"), list("eef")],
+            [[1, 4, 4], [4, 6, 6]],
+        ]
+        is_uniq = [True, True, False, False]
+
+        for idx, expected in zip(indexes, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+            psdf = ps.from_pandas(pdf)
+
+            self.assertEqual(psdf.index.is_unique, expected)
+
+    def test_index_nunique(self):
+        # The input contains a repeated value and a None entry.
+        pidx = pd.Index([1, 1, 2, None])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(pidx.nunique(), psidx.nunique())
+        self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True))
+
+    def test_multiindex_nunique(self):
+        # NOTE(review): despite its name, this test exercises `notnull`, not `nunique`;
+        # the name is kept as-is. `nunique` on a MultiIndex is covered by
+        # test_multi_index_nunique below.
+        psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
+        with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
+            psidx.notnull()
+
+    def test_multi_index_nunique(self):
+        tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
+        pmidx = pd.MultiIndex.from_tuples(tuples)
+        psmidx = ps.from_pandas(pmidx)
+
+        with self.assertRaisesRegex(NotImplementedError, "nunique is not defined for MultiIndex"):
+            psmidx.nunique()
+
+    def test_index_has_duplicates(self):
+        # Same inputs as test_index_is_unique, with the expected flags inverted.
+        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+        names = [None, "ks", "ks", None]
+        has_dup = [False, True, True, False]
+
+        for idx, name, expected in zip(indexes, names, has_dup):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+            psdf = ps.from_pandas(pdf)
+
+            self.assertEqual(psdf.index.has_duplicates, expected)
+
+    def test_multiindex_has_duplicates(self):
+        # Same inputs as test_multiindex_is_unique, with the expected flags inverted.
+        indexes = [
+            [list("abc"), list("edf")],
+            [list("aac"), list("edf")],
+            [list("aac"), list("eef")],
+            [[1, 4, 4], [4, 6, 6]],
+        ]
+        has_dup = [False, False, True, True]
+
+        for idx, expected in zip(indexes, has_dup):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+            psdf = ps.from_pandas(pdf)
+
+            self.assertEqual(psdf.index.has_duplicates, expected)
+
+
+class UniqueTests(UniqueMixin, PandasOnSparkTestCase, SQLTestUtils):
+    """Combines the UniqueMixin tests with PandasOnSparkTestCase and SQLTestUtils."""
+
+
+if __name__ == "__main__":
+    # Runs only when this module is executed as a script.
+    from pyspark.pandas.tests.indexes.test_unique import *  # noqa: F401
+
+    # Use the XML runner (reports written under target/test-reports) when xmlrunner
+    # can be imported; otherwise pass None as the runner to unittest.main.
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org