You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2024/01/11 02:52:49 UTC

(spark) branch master updated: [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 514ecc6fc183 [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`
514ecc6fc183 is described below

commit 514ecc6fc183d7222b9dc299af4df328c71966d1
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Thu Jan 11 10:52:32 2024 +0800

    [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests`
    
    ### What changes were proposed in this pull request?
    Split `GroupbyParitySplitApplyTests`
    
    ### Why are the changes needed?
    To improve testing parallelism.
    
    This test normally takes about 10 minutes:
    ```
    Starting test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (temp output: /__w/spark/spark/python/target/fb71133e-7d03-4c9b-8a64-10e1d02d6bb6/python3.9__pyspark.pandas.tests.connect.groupby.test_parity_split_apply__6wojkexo.log)
    Finished test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (598s)
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    no, test-only
    
    ### How was this patch tested?
    Tested by CI.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44664 from zhengruifeng/ps_test_split_apply.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 dev/sparktestsupport/modules.py                    |  6 ++++
 .../connect/groupby/test_parity_split_apply.py     |  4 ++-
 ...lit_apply.py => test_parity_split_apply_adv.py} | 10 ++++---
 ...t_apply.py => test_parity_split_apply_basic.py} | 10 ++++---
 ...apply.py => test_parity_split_apply_min_max.py} | 10 ++++---
 .../pandas/tests/groupby/test_split_apply.py       | 32 ++++++++++++++++------
 .../test_split_apply_adv.py}                       | 24 +++++++++++-----
 .../test_split_apply_basic.py}                     | 24 +++++++++++-----
 .../test_split_apply_min_max.py}                   | 24 +++++++++++-----
 9 files changed, 102 insertions(+), 42 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index b8ae23613688..abeb1aa5666a 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -888,6 +888,9 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.tests.groupby.test_rank",
         "pyspark.pandas.tests.groupby.test_size",
         "pyspark.pandas.tests.groupby.test_split_apply",
+        "pyspark.pandas.tests.groupby.test_split_apply_adv",
+        "pyspark.pandas.tests.groupby.test_split_apply_basic",
+        "pyspark.pandas.tests.groupby.test_split_apply_min_max",
         "pyspark.pandas.tests.groupby.test_stat",
         "pyspark.pandas.tests.groupby.test_stat_adv",
         "pyspark.pandas.tests.groupby.test_stat_ddof",
@@ -1174,6 +1177,9 @@ pyspark_pandas_connect_part1 = Module(
         "pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
         "pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
         "pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic",
+        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov",
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
index 895fe984be27..b5678f91ab02 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
@@ -22,7 +22,9 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
 class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+    GroupbySplitApplyMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
 ):
     pass
 
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
similarity index 84%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
index 895fe984be27..f8ddd8b8c9ab 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py
@@ -16,19 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_adv import GroupbySplitApplyAdvMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyAdvParityTests(
+    GroupbySplitApplyAdvMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
similarity index 83%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
index 895fe984be27..2964213ab484 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py
@@ -16,19 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_basic import GroupbySplitApplyBasicMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyBasicParityTests(
+    GroupbySplitApplyBasicMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
similarity index 83%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
index 895fe984be27..1d0e2eb2957c 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py
@@ -16,19 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
+from pyspark.pandas.tests.groupby.test_split_apply_min_max import GroupbySplitApplyMMMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyMMParityTests(
+    GroupbySplitApplyMMMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py
index 13cdae94c0e3..8251ba0e9bb2 100644
--- a/python/pyspark/pandas/tests/groupby/test_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py
@@ -19,12 +19,12 @@ import unittest
 import pandas as pd
 
 from pyspark import pandas as ps
-from pyspark.testing.pandasutils import ComparisonTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
-class GroupbySplitApplyMixin:
-    def test_split_apply_combine_on_series(self):
+class GroupbySplitApplyTestingFuncMixin:
+    def _test_split_apply_func(self, funcs):
         # TODO(SPARK-45228): Enabling string type columns for `test_split_apply_combine_on_series`
         #  when Pandas regression is fixed
         # There is a regression in Pandas 2.1.0,
@@ -42,11 +42,14 @@ class GroupbySplitApplyMixin:
         psdf = ps.from_pandas(pdf)
 
         funcs = [
-            ((True, False), ["sum", "min", "max", "count", "first", "last"]),
-            ((True, True), ["mean"]),
-            ((False, False), ["var", "std", "skew"]),
+            (
+                check_exact,
+                almost,
+                f,
+            )
+            for (check_exact, almost), fs in funcs
+            for f in fs
         ]
-        funcs = [(check_exact, almost, f) for (check_exact, almost), fs in funcs for f in fs]
 
         for as_index in [True, False]:
             if as_index:
@@ -155,7 +158,20 @@ class GroupbySplitApplyMixin:
                     )
 
 
-class GroupbySplitApplyTests(GroupbySplitApplyMixin, ComparisonTestBase, SQLTestUtils):
+class GroupbySplitApplyMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((True, False), ["sum"]),
+            ((True, True), ["mean"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyTests(
+    GroupbySplitApplyMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
+):
     pass
 
 
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
similarity index 64%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
index 895fe984be27..abce6d5ed4f0 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py
@@ -16,22 +16,32 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyAdvMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((False, False), ["var", "std", "skew"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyAdvTests(
+    GroupbySplitApplyAdvMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.groupby.test_split_apply_adv import *  # noqa: F401
 
     try:
-        import xmlrunner  # type: ignore[import]
+        import xmlrunner
 
         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
similarity index 64%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
index 895fe984be27..17c6179d19ac 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py
@@ -16,22 +16,32 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyBasicMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((True, False), ["count", "first", "last"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyBasicTests(
+    GroupbySplitApplyBasicMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.groupby.test_split_apply_basic import *  # noqa: F401
 
     try:
-        import xmlrunner  # type: ignore[import]
+        import xmlrunner
 
         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:
diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
similarity index 65%
copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
copy to python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
index 895fe984be27..c16c23323a8c 100644
--- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py
+++ b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py
@@ -16,22 +16,32 @@
 #
 import unittest
 
-from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin
-from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin
 
 
-class GroupbyParitySplitApplyTests(
-    GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+class GroupbySplitApplyMMMixin(GroupbySplitApplyTestingFuncMixin):
+    def test_split_apply_combine_on_series(self):
+        funcs = [
+            ((True, False), ["min", "max"]),
+        ]
+        self._test_split_apply_func(funcs)
+
+
+class GroupbySplitApplyMMTests(
+    GroupbySplitApplyMMMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import *  # noqa: F401
+    from pyspark.pandas.tests.groupby.test_split_apply_min_max import *  # noqa: F401
 
     try:
-        import xmlrunner  # type: ignore[import]
+        import xmlrunner
 
         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org