You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2024/01/11 02:52:49 UTC
(spark) branch master updated: [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 514ecc6fc183 [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`
514ecc6fc183 is described below
commit 514ecc6fc183d7222b9dc299af4df328c71966d1
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Thu Jan 11 10:52:32 2024 +0800
[SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`
### What changes were proposed in this pull request?
Split `GroupbyParitySplitApplyTests`
### Why are the changes needed?
To improve testing parallelism.
This test normally takes about 10 minutes:
```
Starting test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (temp output: /__w/spark/spark/python/target/fb71133e-7d03-4c9b-8a64-10e1d02d6bb6/python3.9__pyspark.pandas.tests.connect.groupby.test_parity_split_apply__6wojkexo.log)
Finished test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (598s)
```
### Does this PR introduce _any_ user-facing change?
no, test-only
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44664 from zhengruifeng/ps_test_split_apply.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
dev/sparktestsupport/modules.py | 6 ++++
.../connect/groupby/test_parity_split_apply.py | 4 ++-
...lit_apply.py => test_parity_split_apply_adv.py} | 10 ++++---
...t_apply.py => test_parity_split_apply_basic.py} | 10 ++++---
...apply.py => test_parity_split_apply_min_max.py} | 10 ++++---
.../pandas/tests/groupby/test_split_apply.py | 32 ++++++++++++++++------
.../test_split_apply_adv.py} | 24 +++++++++++-----
.../test_split_apply_basic.py} | 24 +++++++++++-----
.../test_split_apply_min_max.py} | 24 +++++++++++-----
9 files changed, 102 insertions(+), 42 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index b8ae23613688..abeb1aa5666a 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -888,6 +888,9 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.groupby.test_rank",
"pyspark.pandas.tests.groupby.test_size",
"pyspark.pandas.tests.groupby.test_split_apply",
+ "pyspark.pandas.tests.groupby.test_split_apply_adv",
+ "pyspark.pandas.tests.groupby.test_split_apply_basic",
+ "pyspark.pandas.tests.groupby.test_split_apply_min_max",
"pyspark.pandas.tests.groupby.test_stat",
"pyspark.pandas.tests.groupby.test_stat_adv",
"pyspark.pandas.tests.groupby.test_stat_ddof",
@@ -1174,6 +1177,9 @@ pyspark_pandas_connect_part1 = Module(
"pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
"pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
"pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
+ "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv",
+ "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic",
+ "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov",
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
index 895fe984be27..b5678f91ab02 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
@@ -22,7 +22,9 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils
class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+ GroupbySplitApplyMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
):
pass
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
similarity index 84%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
index 895fe984be27..f8ddd8b8c9ab 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
@@ -16,19 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_adv import GroupbySplitApplyAdvMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyAdvParityTests(
+ GroupbySplitApplyAdvMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
similarity index 83%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
index 895fe984be27..2964213ab484 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
@@ -16,19 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_basic import GroupbySplitApplyBasicMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyBasicParityTests(
+ GroupbySplitApplyBasicMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
similarity index 83%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
index 895fe984be27..1d0e2eb2957c 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
@@ -16,19 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_min_max import GroupbySplitApplyMMMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyMMParityTests(
+ GroupbySplitApplyMMMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py
index 13cdae94c0e3..8251ba0e9bb2 100644
--- a/python/pyspark/pandas/tests/groupby/test_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py
@@ -19,12 +19,12 @@ import unittest
import pandas as pd
from pyspark import pandas as ps
-from pyspark.testing.pandasutils import ComparisonTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
-class GroupbySplitApplyMixin:
- def test_split_apply_combine_on_series(self):
+class GroupbySplitApplyTestingFuncMixin:
+ def _test_split_apply_func(self, funcs):
# TODO(SPARK-45228): Enabling string type columns for `test_split_apply_combine_on_series`
# when Pandas regression is fixed
# There is a regression in Pandas 2.1.0,
@@ -42,11 +42,14 @@ class GroupbySplitApplyMixin:
psdf = ps.from_pandas(pdf)
funcs = [
- ((True, False), ["sum", "min", "max", "count", "first", "last"]),
- ((True, True), ["mean"]),
- ((False, False), ["var", "std", "skew"]),
+ (
+ check_exact,
+ almost,
+ f,
+ )
+ for (check_exact, almost), fs in funcs
+ for f in fs
]
- funcs = [(check_exact, almost, f) for (check_exact, almost), fs in funcs for f in fs]
for as_index in [True, False]:
if as_index:
@@ -155,7 +158,20 @@ class GroupbySplitApplyMixin:
)
-class GroupbySplitApplyTests(GroupbySplitApplyMixin, ComparisonTestBase, SQLTestUtils):
+class GroupbySplitApplyMixin(GroupbySplitApplyTestingFuncMixin):
+ def test_split_apply_combine_on_series(self):
+ funcs = [
+ ((True, False), ["sum"]),
+ ((True, True), ["mean"]),
+ ]
+ self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyTests(
+ GroupbySplitApplyMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
pass
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
similarity index 64%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
index 895fe984be27..abce6d5ed4f0 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
@@ -16,22 +16,32 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyAdvMixin(GroupbySplitApplyTestingFuncMixin):
+ def test_split_apply_combine_on_series(self):
+ funcs = [
+ ((False, False), ["var", "std", "skew"]),
+ ]
+ self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyAdvTests(
+ GroupbySplitApplyAdvMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.groupby.test_split_apply_adv import * # noqa: F401
try:
- import xmlrunner # type: ignore[import]
+ import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
similarity index 64%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
index 895fe984be27..17c6179d19ac 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
@@ -16,22 +16,32 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyBasicMixin(GroupbySplitApplyTestingFuncMixin):
+ def test_split_apply_combine_on_series(self):
+ funcs = [
+ ((True, False), ["count", "first", "last"]),
+ ]
+ self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyBasicTests(
+ GroupbySplitApplyBasicMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.groupby.test_split_apply_basic import * # noqa: F401
try:
- import xmlrunner # type: ignore[import]
+ import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
similarity index 65%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
index 895fe984be27..c16c23323a8c 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
@@ -16,22 +16,32 @@
#
import unittest
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
-class GroupbyParitySplitApplyTests(
- GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyMMMixin(GroupbySplitApplyTestingFuncMixin):
+ def test_split_apply_combine_on_series(self):
+ funcs = [
+ ((True, False), ["min", "max"]),
+ ]
+ self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyMMTests(
+ GroupbySplitApplyMMMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401
+ from pyspark.pandas.tests.groupby.test_split_apply_min_max import * # noqa: F401
try:
- import xmlrunner # type: ignore[import]
+ import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org