You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/12/31 09:34:37 UTC
(spark) branch master updated: [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ae9106370a9f [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`
ae9106370a9f is described below
commit ae9106370a9f3002979077662149ba9122d86cef
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Sun Dec 31 17:34:17 2023 +0800
[SPARK-46504][PS][TESTS][FOLLOWUPS] Moving move slow tests out of `IndexesTests`
### What changes were proposed in this pull request?
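Move slow tests out of `IndexesTests` into the new `test_sort`, `test_symmetric_diff`, `test_take` and `test_unique` modules, and add the matching Spark Connect parity test modules.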
### Why are the changes needed?
To improve testing parallelism.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Tested by CI.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44543 from zhengruifeng/ps_test_idx_base_sort_take.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
dev/sparktestsupport/modules.py | 8 +
.../tests/connect/indexes/test_parity_sort.py | 41 +++
.../connect/indexes/test_parity_symmetric_diff.py | 41 +++
.../tests/connect/indexes/test_parity_take.py | 41 +++
.../tests/connect/indexes/test_parity_unique.py | 41 +++
python/pyspark/pandas/tests/indexes/test_base.py | 280 ---------------------
python/pyspark/pandas/tests/indexes/test_sort.py | 99 ++++++++
.../pandas/tests/indexes/test_symmetric_diff.py | 125 +++++++++
python/pyspark/pandas/tests/indexes/test_take.py | 84 +++++++
python/pyspark/pandas/tests/indexes/test_unique.py | 164 ++++++++++++
10 files changed, 644 insertions(+), 280 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 7f0c31201c84..6aca31e5efdf 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -798,6 +798,10 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.indexes.test_base",
"pyspark.pandas.tests.indexes.test_conversion",
"pyspark.pandas.tests.indexes.test_drop",
+ "pyspark.pandas.tests.indexes.test_sort",
+ "pyspark.pandas.tests.indexes.test_symmetric_diff",
+ "pyspark.pandas.tests.indexes.test_take",
+ "pyspark.pandas.tests.indexes.test_unique",
"pyspark.pandas.tests.indexes.test_asof",
"pyspark.pandas.tests.indexes.test_astype",
"pyspark.pandas.tests.indexes.test_delete",
@@ -1090,6 +1094,10 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.indexes.test_parity_base",
"pyspark.pandas.tests.connect.indexes.test_parity_conversion",
"pyspark.pandas.tests.connect.indexes.test_parity_drop",
+ "pyspark.pandas.tests.connect.indexes.test_parity_sort",
+ "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff",
+ "pyspark.pandas.tests.connect.indexes.test_parity_take",
+ "pyspark.pandas.tests.connect.indexes.test_parity_unique",
"pyspark.pandas.tests.connect.indexes.test_parity_asof",
"pyspark.pandas.tests.connect.indexes.test_parity_astype",
"pyspark.pandas.tests.connect.indexes.test_parity_delete",
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py
new file mode 100644
index 000000000000..1affd5c3d777
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_sort.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_sort import IndexesSortMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesSortParityTests(
+ IndexesSortMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.indexes.test_parity_sort import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py
new file mode 100644
index 000000000000..825167baffa8
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_symmetric_diff.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_symmetric_diff import IndexesSymmetricDiffMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesSymmetricDiffParityTests(
+ IndexesSymmetricDiffMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py
new file mode 100644
index 000000000000..34151a4a8fb6
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_take.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_take import IndexesTakeMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesTakeParityTests(
+ IndexesTakeMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.indexes.test_parity_take import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py
new file mode 100644
index 000000000000..d303b8f969db
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_unique.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.indexes.test_unique import UniqueMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class IndexesUniqueParityTests(
+ UniqueMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.indexes.test_parity_unique import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 788319a38c2d..f7e6c553ac15 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -235,24 +235,6 @@ class IndexesTestsMixin:
psidx = ps.from_pandas(pidx)
self.assertEqual(pidx.levshape, psidx.levshape)
- def test_index_unique(self):
- psidx = self.psdf.index
-
- # here the output is different than pandas in terms of order
- expected = [0, 1, 3, 5, 6, 8, 9]
-
- self.assert_eq(expected, sorted(psidx.unique()._to_pandas()))
- self.assert_eq(expected, sorted(psidx.unique(level=0)._to_pandas()))
-
- expected = [1, 2, 4, 6, 7, 9, 10]
- self.assert_eq(expected, sorted((psidx + 1).unique()._to_pandas()))
-
- with self.assertRaisesRegex(IndexError, "Too many levels*"):
- psidx.unique(level=1)
-
- with self.assertRaisesRegex(KeyError, "Requested level (hi)*"):
- psidx.unique(level="hi")
-
def test_multi_index_copy(self):
arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
@@ -261,85 +243,6 @@ class IndexesTestsMixin:
self.assert_eq(psdf.index.copy(), pdf.index.copy())
- def test_index_symmetric_difference(self):
- pidx1 = pd.Index([1, 2, 3, 4])
- pidx2 = pd.Index([2, 3, 4, 5])
- psidx1 = ps.from_pandas(pidx1)
- psidx2 = ps.from_pandas(pidx2)
-
- self.assert_eq(
- psidx1.symmetric_difference(psidx2).sort_values(),
- pidx1.symmetric_difference(pidx2).sort_values(),
- )
- self.assert_eq(
- (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
- (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
- )
- # No longer supported from pandas 2.0.0.
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
- self.assert_eq(
- (psidx1 ^ psidx2).sort_values(),
- ps.Index([1, 5], dtype="int64"),
- )
- else:
- self.assert_eq(
- (psidx1 ^ psidx2).sort_values(),
- (pidx1 ^ pidx2).sort_values(),
- )
- self.assert_eq(
- psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
- pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
- )
-
- pmidx1 = pd.MultiIndex(
- [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
- [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
- )
- pmidx2 = pd.MultiIndex(
- [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
- [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
- )
- psmidx1 = ps.from_pandas(pmidx1)
- psmidx2 = ps.from_pandas(pmidx2)
-
- self.assert_eq(
- psmidx1.symmetric_difference(psmidx2).sort_values(),
- pmidx1.symmetric_difference(pmidx2).sort_values(),
- )
-
- # Pandas has a bug that raise TypeError when setting `result_name` for MultiIndex.
- pandas_result = pmidx1.symmetric_difference(pmidx2)
- pandas_result.names = ["a", "b"]
- self.assert_eq(
- psmidx1.symmetric_difference(psmidx2, result_name=["a", "b"]).sort_values(),
- pandas_result,
- )
-
- # Pandas sort the result by default, so doesn't provide the `True` for sort.
- self.assert_eq(
- psmidx1.symmetric_difference(psmidx2, sort=True),
- pmidx1.symmetric_difference(pmidx2),
- )
-
- idx = ps.Index(["a", "b", "c"])
- midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-
- with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
- idx.symmetric_difference(midx)
-
- def test_multi_index_symmetric_difference(self):
- idx = ps.Index(["a", "b", "c"])
- midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
- midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-
- self.assert_eq(
- midx.symmetric_difference(midx_),
- midx._to_pandas().symmetric_difference(midx_._to_pandas()),
- )
-
- with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
- midx.symmetric_difference(idx)
-
def test_missing(self):
psdf = ps.DataFrame(
{
@@ -521,32 +424,6 @@ class IndexesTestsMixin:
):
getattr(psdf.set_index("c").index, name)
- def test_index_has_duplicates(self):
- indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
- names = [None, "ks", "ks", None]
- has_dup = [False, True, True, False]
-
- for idx, name, expected in zip(indexes, names, has_dup):
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
- psdf = ps.from_pandas(pdf)
-
- self.assertEqual(psdf.index.has_duplicates, expected)
-
- def test_multiindex_has_duplicates(self):
- indexes = [
- [list("abc"), list("edf")],
- [list("aac"), list("edf")],
- [list("aac"), list("eef")],
- [[1, 4, 4], [4, 6, 6]],
- ]
- has_dup = [False, False, True, True]
-
- for idx, expected in zip(indexes, has_dup):
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
- psdf = ps.from_pandas(pdf)
-
- self.assertEqual(psdf.index.has_duplicates, expected)
-
def test_multi_index_not_supported(self):
psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
@@ -602,60 +479,6 @@ class IndexesTestsMixin:
with self.assertRaisesRegex(TypeError, "Unsupported type list"):
psidx.fillna([1, 2])
- def _test_sort_values(self, pidx, psidx):
- self.assert_eq(pidx.sort_values(), psidx.sort_values())
- # Parameter ascending
- self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False))
- # Parameter return_indexer
- p_sorted, p_indexer = pidx.sort_values(return_indexer=True)
- ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True)
- self.assert_eq(p_sorted, ps_sorted)
- self.assert_eq(p_indexer, ps_indexer.to_list())
- self.assert_eq(
- pidx.sort_values(return_indexer=False), psidx.sort_values(return_indexer=False)
- )
- # Parameter return_indexer and ascending
- p_sorted, p_indexer = pidx.sort_values(return_indexer=True, ascending=False)
- ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True, ascending=False)
- self.assert_eq(p_sorted, ps_sorted)
- self.assert_eq(p_indexer, ps_indexer.to_list())
- self.assert_eq(
- pidx.sort_values(return_indexer=False, ascending=False),
- psidx.sort_values(return_indexer=False, ascending=False),
- )
-
- def test_sort_values(self):
- pidx = pd.Index([-10, -100, 200, 100])
- psidx = ps.from_pandas(pidx)
-
- self._test_sort_values(pidx, psidx)
-
- pidx.name = "koalas"
- psidx.name = "koalas"
-
- self._test_sort_values(pidx, psidx)
-
- pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
- psidx = ps.from_pandas(pidx)
-
- pidx.names = ["hello", "koalas", "goodbye"]
- psidx.names = ["hello", "koalas", "goodbye"]
-
- self._test_sort_values(pidx, psidx)
-
- def test_index_sort(self):
- idx = ps.Index([1, 2, 3, 4, 5])
- midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
-
- with self.assertRaisesRegex(
- TypeError, "cannot sort an Index object in-place, use sort_values instead"
- ):
- idx.sort()
- with self.assertRaisesRegex(
- TypeError, "cannot sort an Index object in-place, use sort_values instead"
- ):
- midx.sort()
-
def test_multiindex_isna(self):
psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
@@ -671,18 +494,6 @@ class IndexesTestsMixin:
with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
psidx.notnull()
- def test_index_nunique(self):
- pidx = pd.Index([1, 1, 2, None])
- psidx = ps.from_pandas(pidx)
-
- self.assert_eq(pidx.nunique(), psidx.nunique())
- self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True))
-
- def test_multiindex_nunique(self):
- psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
- with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
- psidx.notnull()
-
def test_multiindex_rename(self):
pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
psidx = ps.from_pandas(pidx)
@@ -829,63 +640,6 @@ class IndexesTestsMixin:
self.assertRaises(ValueError, lambda: psmidx.repeat(-1))
self.assertRaises(TypeError, lambda: psmidx.repeat("abc"))
- def test_unique(self):
- pidx = pd.Index(["a", "b", "a"])
- psidx = ps.from_pandas(pidx)
-
- self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
- self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
-
- pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
- psmidx = ps.from_pandas(pmidx)
-
- self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
- self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
-
- with self.assertRaisesRegex(
- IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
- ):
- psidx.unique(level=-2)
-
- def test_take(self):
- # Index
- pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas")
- psidx = ps.from_pandas(pidx)
-
- self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values())
- self.assert_eq(
- psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values()
- )
- self.assert_eq(psidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values())
- self.assert_eq(
- psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values()
- )
-
- # MultiIndex
- pmidx = pd.MultiIndex.from_tuples(
- [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
- )
- psmidx = ps.from_pandas(pmidx)
-
- self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values())
- self.assert_eq(
- psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values()
- )
- self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values())
- self.assert_eq(
- psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values()
- )
-
- # Checking the type of indices.
- self.assertRaises(TypeError, lambda: psidx.take(1))
- self.assertRaises(TypeError, lambda: psidx.take("1"))
- self.assertRaises(TypeError, lambda: psidx.take({1, 2}))
- self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None}))
- self.assertRaises(TypeError, lambda: psmidx.take(1))
- self.assertRaises(TypeError, lambda: psmidx.take("1"))
- self.assertRaises(TypeError, lambda: psmidx.take({1, 2}))
- self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None}))
-
def test_index_get_level_values(self):
pidx = pd.Index([1, 2, 3], name="ks")
psidx = ps.from_pandas(pidx)
@@ -1056,32 +810,6 @@ class IndexesTestsMixin:
psmidx = ps.from_pandas(pmidx)
self.assert_eq(pmidx.inferred_type, psmidx.inferred_type)
- def test_index_is_unique(self):
- indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
- names = [None, "ks", "ks", None]
- is_uniq = [True, False, False, True]
-
- for idx, name, expected in zip(indexes, names, is_uniq):
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
- psdf = ps.from_pandas(pdf)
-
- self.assertEqual(psdf.index.is_unique, expected)
-
- def test_multiindex_is_unique(self):
- indexes = [
- [list("abc"), list("edf")],
- [list("aac"), list("edf")],
- [list("aac"), list("eef")],
- [[1, 4, 4], [4, 6, 6]],
- ]
- is_uniq = [True, True, False, False]
-
- for idx, expected in zip(indexes, is_uniq):
- pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
- psdf = ps.from_pandas(pdf)
-
- self.assertEqual(psdf.index.is_unique, expected)
-
def test_view(self):
pidx = pd.Index([1, 2, 3, 4], name="Koalas")
psidx = ps.from_pandas(pidx)
@@ -1165,14 +893,6 @@ class IndexesTestsMixin:
psmidx2 = ps.from_pandas(pmidx2)
self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2))
- def test_multi_index_nunique(self):
- tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
- pmidx = pd.MultiIndex.from_tuples(tuples)
- psmidx = ps.from_pandas(pmidx)
-
- with self.assertRaisesRegex(NotImplementedError, "nunique is not defined for MultiIndex"):
- psmidx.nunique()
-
class IndexesTests(
IndexesTestsMixin,
diff --git a/python/pyspark/pandas/tests/indexes/test_sort.py b/python/pyspark/pandas/tests/indexes/test_sort.py
new file mode 100644
index 000000000000..8e38f2cff3f3
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_sort.py
@@ -0,0 +1,99 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesSortMixin:
+ def _test_sort_values(self, pidx, psidx):
+ self.assert_eq(pidx.sort_values(), psidx.sort_values())
+ # Parameter ascending
+ self.assert_eq(pidx.sort_values(ascending=False), psidx.sort_values(ascending=False))
+ # Parameter return_indexer
+ p_sorted, p_indexer = pidx.sort_values(return_indexer=True)
+ ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True)
+ self.assert_eq(p_sorted, ps_sorted)
+ self.assert_eq(p_indexer, ps_indexer.to_list())
+ self.assert_eq(
+ pidx.sort_values(return_indexer=False), psidx.sort_values(return_indexer=False)
+ )
+ # Parameter return_indexer and ascending
+ p_sorted, p_indexer = pidx.sort_values(return_indexer=True, ascending=False)
+ ps_sorted, ps_indexer = psidx.sort_values(return_indexer=True, ascending=False)
+ self.assert_eq(p_sorted, ps_sorted)
+ self.assert_eq(p_indexer, ps_indexer.to_list())
+ self.assert_eq(
+ pidx.sort_values(return_indexer=False, ascending=False),
+ psidx.sort_values(return_indexer=False, ascending=False),
+ )
+
+ def test_sort_values(self):
+ pidx = pd.Index([-10, -100, 200, 100])
+ psidx = ps.from_pandas(pidx)
+
+ self._test_sort_values(pidx, psidx)
+
+ pidx.name = "koalas"
+ psidx.name = "koalas"
+
+ self._test_sort_values(pidx, psidx)
+
+ pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
+ psidx = ps.from_pandas(pidx)
+
+ pidx.names = ["hello", "koalas", "goodbye"]
+ psidx.names = ["hello", "koalas", "goodbye"]
+
+ self._test_sort_values(pidx, psidx)
+
+ def test_index_sort(self):
+ idx = ps.Index([1, 2, 3, 4, 5])
+ midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)])
+
+ with self.assertRaisesRegex(
+ TypeError, "cannot sort an Index object in-place, use sort_values instead"
+ ):
+ idx.sort()
+ with self.assertRaisesRegex(
+ TypeError, "cannot sort an Index object in-place, use sort_values instead"
+ ):
+ midx.sort()
+
+
+class IndexesSortTests(
+ IndexesSortMixin,
+ ComparisonTestBase,
+ TestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.indexes.test_sort import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py
new file mode 100644
index 000000000000..d08544557954
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py
@@ -0,0 +1,125 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesSymmetricDiffMixin:
+ def test_index_symmetric_difference(self):
+ pidx1 = pd.Index([1, 2, 3, 4])
+ pidx2 = pd.Index([2, 3, 4, 5])
+ psidx1 = ps.from_pandas(pidx1)
+ psidx2 = ps.from_pandas(pidx2)
+
+ self.assert_eq(
+ psidx1.symmetric_difference(psidx2).sort_values(),
+ pidx1.symmetric_difference(pidx2).sort_values(),
+ )
+ self.assert_eq(
+ (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
+ (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
+ )
+ # No longer supported from pandas 2.0.0.
+ if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
+ self.assert_eq(
+ (psidx1 ^ psidx2).sort_values(),
+ ps.Index([1, 5], dtype="int64"),
+ )
+ else:
+ self.assert_eq(
+ (psidx1 ^ psidx2).sort_values(),
+ (pidx1 ^ pidx2).sort_values(),
+ )
+ self.assert_eq(
+ psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
+ pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
+ )
+
+ pmidx1 = pd.MultiIndex(
+ [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
+ [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
+ )
+ pmidx2 = pd.MultiIndex(
+ [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
+ [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
+ )
+ psmidx1 = ps.from_pandas(pmidx1)
+ psmidx2 = ps.from_pandas(pmidx2)
+
+ self.assert_eq(
+ psmidx1.symmetric_difference(psmidx2).sort_values(),
+ pmidx1.symmetric_difference(pmidx2).sort_values(),
+ )
+
+ # Pandas has a bug that raise TypeError when setting `result_name` for MultiIndex.
+ pandas_result = pmidx1.symmetric_difference(pmidx2)
+ pandas_result.names = ["a", "b"]
+ self.assert_eq(
+ psmidx1.symmetric_difference(psmidx2, result_name=["a", "b"]).sort_values(),
+ pandas_result,
+ )
+
+ # Pandas sort the result by default, so doesn't provide the `True` for sort.
+ self.assert_eq(
+ psmidx1.symmetric_difference(psmidx2, sort=True),
+ pmidx1.symmetric_difference(pmidx2),
+ )
+
+ idx = ps.Index(["a", "b", "c"])
+ midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+
+ with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
+ idx.symmetric_difference(midx)
+
+ def test_multi_index_symmetric_difference(self):
+ idx = ps.Index(["a", "b", "c"])
+ midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+ midx_ = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+
+ self.assert_eq(
+ midx.symmetric_difference(midx_),
+ midx._to_pandas().symmetric_difference(midx_._to_pandas()),
+ )
+
+ with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
+ midx.symmetric_difference(idx)
+
+
+class IndexesSymmetricDiffTests(
+ IndexesSymmetricDiffMixin,
+ ComparisonTestBase,
+ TestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.indexes.test_symmetric_diff import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_take.py b/python/pyspark/pandas/tests/indexes/test_take.py
new file mode 100644
index 000000000000..efd95c12a995
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_take.py
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+
+
+class IndexesTakeMixin:
+ def test_take(self):
+ # Index
+ pidx = pd.Index([100, 200, 300, 400, 500], name="Koalas")
+ psidx = ps.from_pandas(pidx)
+
+ self.assert_eq(psidx.take([0, 2, 4]).sort_values(), pidx.take([0, 2, 4]).sort_values())
+ self.assert_eq(
+ psidx.take(range(0, 5, 2)).sort_values(), pidx.take(range(0, 5, 2)).sort_values()
+ )
+ self.assert_eq(psidx.take([-4, -2, 0]).sort_values(), pidx.take([-4, -2, 0]).sort_values())
+ self.assert_eq(
+ psidx.take(range(-4, 1, 2)).sort_values(), pidx.take(range(-4, 1, 2)).sort_values()
+ )
+
+ # MultiIndex
+ pmidx = pd.MultiIndex.from_tuples(
+ [("x", "a"), ("x", "b"), ("x", "c")], names=["hello", "Koalas"]
+ )
+ psmidx = ps.from_pandas(pmidx)
+
+ self.assert_eq(psmidx.take([0, 2]).sort_values(), pmidx.take([0, 2]).sort_values())
+ self.assert_eq(
+ psmidx.take(range(0, 4, 2)).sort_values(), pmidx.take(range(0, 4, 2)).sort_values()
+ )
+ self.assert_eq(psmidx.take([-2, 0]).sort_values(), pmidx.take([-2, 0]).sort_values())
+ self.assert_eq(
+ psmidx.take(range(-2, 1, 2)).sort_values(), pmidx.take(range(-2, 1, 2)).sort_values()
+ )
+
+ # Checking the type of indices.
+ self.assertRaises(TypeError, lambda: psidx.take(1))
+ self.assertRaises(TypeError, lambda: psidx.take("1"))
+ self.assertRaises(TypeError, lambda: psidx.take({1, 2}))
+ self.assertRaises(TypeError, lambda: psidx.take({1: None, 2: None}))
+ self.assertRaises(TypeError, lambda: psmidx.take(1))
+ self.assertRaises(TypeError, lambda: psmidx.take("1"))
+ self.assertRaises(TypeError, lambda: psmidx.take({1, 2}))
+ self.assertRaises(TypeError, lambda: psmidx.take({1: None, 2: None}))
+
+
+class IndexesTakeTests(
+ IndexesTakeMixin,
+ ComparisonTestBase,
+ TestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.indexes.test_take import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/indexes/test_unique.py b/python/pyspark/pandas/tests/indexes/test_unique.py
new file mode 100644
index 000000000000..f983aabcee6b
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_unique.py
@@ -0,0 +1,164 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+ class UniqueMixin:
+     """Tests for the uniqueness-related APIs of pandas-on-Spark indexes.
+
+     Covers ``unique``, ``is_unique``, ``nunique`` and ``has_duplicates`` on
+     ``Index`` and ``MultiIndex``. This is a mixin: ``assert_eq`` and the
+     ``assert*`` / ``subTest`` helpers are expected to come from the test-case
+     base classes it is combined with.
+     """
+
+     @property
+     def pdf(self):
+         """Return a new pandas DataFrame whose index holds the label 9 three times."""
+         return pd.DataFrame(
+             {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
+             index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
+         )
+
+     @property
+     def psdf(self):
+         """Return ``pdf`` converted with ``ps.from_pandas``."""
+         return ps.from_pandas(self.pdf)
+
+     def test_index_unique(self):
+         """Check ``Index.unique`` results and its ``level`` validation errors."""
+         psidx = self.psdf.index
+
+         # The output order differs from pandas, so the sorted result is
+         # compared against a hand-written expectation instead.
+         expected = [0, 1, 3, 5, 6, 8, 9]
+
+         self.assert_eq(expected, sorted(psidx.unique()._to_pandas()))
+         self.assert_eq(expected, sorted(psidx.unique(level=0)._to_pandas()))
+
+         expected = [1, 2, 4, 6, 7, 9, 10]
+         self.assert_eq(expected, sorted((psidx + 1).unique()._to_pandas()))
+
+         # The second argument is a regular expression: keep it free of stray
+         # quantifiers and escape the literal parentheses so that the level
+         # name is really part of what is matched.
+         with self.assertRaisesRegex(IndexError, "Too many levels"):
+             psidx.unique(level=1)
+
+         with self.assertRaisesRegex(KeyError, r"Requested level \(hi\)"):
+             psidx.unique(level="hi")
+
+     def test_unique(self):
+         """Compare ``unique`` with pandas for an Index and a MultiIndex."""
+         pidx = pd.Index(["a", "b", "a"])
+         psidx = ps.from_pandas(pidx)
+
+         # Sort both sides before comparing so ordering does not matter.
+         self.assert_eq(psidx.unique().sort_values(), pidx.unique().sort_values())
+
+         pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a")])
+         psmidx = ps.from_pandas(pmidx)
+
+         self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
+
+         with self.assertRaisesRegex(
+             IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
+         ):
+             psidx.unique(level=-2)
+
+     def test_index_is_unique(self):
+         """Check ``Index.is_unique`` for unique and duplicated labels."""
+         indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+         names = [None, "ks", "ks", None]
+         is_uniq = [True, False, False, True]
+
+         for idx, name, expected in zip(indexes, names, is_uniq):
+             # subTest reports which case failed and lets the others still run.
+             with self.subTest(index=idx, name=name):
+                 pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+                 psdf = ps.from_pandas(pdf)
+
+                 self.assertEqual(psdf.index.is_unique, expected)
+
+     def test_multiindex_is_unique(self):
+         """Check ``is_unique`` when the DataFrame index is built from two label lists."""
+         indexes = [
+             [list("abc"), list("edf")],
+             [list("aac"), list("edf")],
+             [list("aac"), list("eef")],
+             [[1, 4, 4], [4, 6, 6]],
+         ]
+         is_uniq = [True, True, False, False]
+
+         for idx, expected in zip(indexes, is_uniq):
+             with self.subTest(index=idx):
+                 pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+                 psdf = ps.from_pandas(pdf)
+
+                 self.assertEqual(psdf.index.is_unique, expected)
+
+     def test_index_nunique(self):
+         """Compare ``Index.nunique`` with pandas on an index containing ``None``."""
+         pidx = pd.Index([1, 1, 2, None])
+         psidx = ps.from_pandas(pidx)
+
+         self.assert_eq(pidx.nunique(), psidx.nunique())
+         self.assert_eq(pidx.nunique(dropna=True), psidx.nunique(dropna=True))
+
+     def test_multiindex_nunique(self):
+         """Check that ``notnull`` on a MultiIndex raises ``NotImplementedError``."""
+         # NOTE(review): despite its name this test exercises ``notnull()``;
+         # ``nunique()`` on a MultiIndex is checked in test_multi_index_nunique.
+         # The name is kept so existing test IDs do not change.
+         psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
+         with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"):
+             psidx.notnull()
+
+     def test_multi_index_nunique(self):
+         """Check that ``nunique`` on a MultiIndex raises ``NotImplementedError``."""
+         tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
+         pmidx = pd.MultiIndex.from_tuples(tuples)
+         psmidx = ps.from_pandas(pmidx)
+
+         with self.assertRaisesRegex(NotImplementedError, "nunique is not defined for MultiIndex"):
+             psmidx.nunique()
+
+     def test_index_has_duplicates(self):
+         """Check ``Index.has_duplicates`` for unique and duplicated labels."""
+         indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+         names = [None, "ks", "ks", None]
+         has_dup = [False, True, True, False]
+
+         for idx, name, expected in zip(indexes, names, has_dup):
+             with self.subTest(index=idx, name=name):
+                 pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+                 psdf = ps.from_pandas(pdf)
+
+                 self.assertEqual(psdf.index.has_duplicates, expected)
+
+     def test_multiindex_has_duplicates(self):
+         """Check ``has_duplicates`` when the DataFrame index is built from two label lists."""
+         indexes = [
+             [list("abc"), list("edf")],
+             [list("aac"), list("edf")],
+             [list("aac"), list("eef")],
+             [[1, 4, 4], [4, 6, 6]],
+         ]
+         has_dup = [False, False, True, True]
+
+         for idx, expected in zip(indexes, has_dup):
+             with self.subTest(index=idx):
+                 pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+                 psdf = ps.from_pandas(pdf)
+
+                 self.assertEqual(psdf.index.has_duplicates, expected)
+
+
+ class UniqueTests(UniqueMixin, PandasOnSparkTestCase, SQLTestUtils):
+     """Concrete test case that runs the ``UniqueMixin`` tests.
+
+     The class adds no tests of its own; it only combines the mixin with the
+     ``PandasOnSparkTestCase`` and ``SQLTestUtils`` base classes.
+     """
+
+
+ if __name__ == "__main__":
+     from pyspark.pandas.tests.indexes.test_unique import *  # noqa: F401
+
+     # Start with no explicit runner and switch to the XML-reporting runner
+     # (reports written under target/test-reports) only if xmlrunner imports.
+     testRunner = None
+     try:
+         import xmlrunner
+
+         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+     except ImportError:
+         # xmlrunner is optional; keep testRunner as None.
+         pass
+     unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org