Posted to commits@spark.apache.org by gu...@apache.org on 2021/08/01 12:39:41 UTC

[spark] branch branch-3.2 updated: [SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 310cd8ee [SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins
310cd8ee is described below

commit 310cd8eef1f17a320bb5334280e4aac52692a3d4
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Sun Aug 1 21:37:19 2021 +0900

    [SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins
    
    This PR proposes to migrate the coverage report from Jenkins to GitHub Actions by setting up a daily cron job.
    
    For some background, PySpark code coverage is currently reported from this specific Jenkins job: https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
    
    Because of a security issue between the [Codecov service](https://app.codecov.io/gh/) and the Jenkins machines, we had to work around it by manually hosting a coverage site via GitHub Pages, see https://spark-test.github.io/pyspark-coverage-site/, published from the spark-test account (which is shared with only a subset of PMC members).
    
    Since we now run the builds via GitHub Actions, we can leverage the [Codecov action](https://github.com/codecov/codecov-action) and remove the workaround we used.
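    
    As a rough illustration only (the actual steps are the shell commands added to python/run-tests-with-coverage in the diff below; file paths here are assumptions), the coverage run combines the per-process data files and writes the XML report that the Codecov action uploads:
    
    ```python
    # Illustrative sketch, not part of this change. Assumes the optional
    # 'coverage' test dependency (added to dev/requirements.txt) is installed.
    import coverage
    
    cov = coverage.Coverage()
    cov.combine()                           # merge the per-process coverage data files
    cov.xml_report(outfile="coverage.xml",  # becomes python/coverage.xml, consumed by codecov-action
                   ignore_errors=True,
                   include=["pyspark/*"])
    ```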
    
    There is virtually no user-facing change. The coverage site (UI) might change, but the information it holds should be virtually the same.
    
    I manually tested:
    - Scheduled run: https://github.com/HyukjinKwon/spark/actions/runs/1082261484
    - Coverage report: https://codecov.io/gh/HyukjinKwon/spark/tree/73f0291a7df1eda98045cd759303aac1c2a9c929/python/pyspark
    - Run against a PR: https://github.com/HyukjinKwon/spark/actions/runs/1082367175
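    
    The scheduled run above enables the coverage path through the new PYSPARK_CODECOV switch in dev/run-tests.py (see the diff below). A minimal local sketch, assuming a full dev environment and invocation from the repository root:
    
    ```python
    # Minimal sketch, not part of this change: drive dev/run-tests.py with the
    # new PYSPARK_CODECOV gate enabled, as the scheduled job is expected to do.
    import os
    import subprocess
    
    env = dict(os.environ, PYSPARK_CODECOV="true")
    subprocess.run(["./dev/run-tests.py"], env=env, check=True)
    ```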
    
    Closes #33591 from HyukjinKwon/SPARK-36092.
    
    Authored-by: Hyukjin Kwon <gu...@apache.org>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit c0d1860f256eccd09a07584b8a77e6c60cc74c46)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .gitignore                                         |  1 +
 README.md                                          |  2 +-
 dev/requirements.txt                               |  3 ++
 dev/run-tests.py                                   | 57 ++--------------------
 .../mllib/tests/test_streaming_algorithms.py       | 13 ++++-
 python/pyspark/streaming/tests/test_dstream.py     |  9 ++--
 python/pyspark/tests/test_context.py               |  2 +-
 python/pyspark/tests/test_worker.py                | 19 ++++++--
 python/run-tests-with-coverage                     |  2 +
 python/test_coverage/coverage_daemon.py            | 27 +++++-----
 python/test_coverage/sitecustomize.py              |  7 ++-
 11 files changed, 61 insertions(+), 81 deletions(-)

diff --git a/.gitignore b/.gitignore
index 86493a7..1a7881a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ project/plugins/src_managed/
 project/plugins/target/
 python/lib/pyspark.zip
 python/.eggs/
+python/coverage.xml
 python/deps
 python/docs/_site/
 python/docs/source/reference/**/api/
diff --git a/README.md b/README.md
index 7d2c457..a208497 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ and Structured Streaming for stream processing.
 [![GitHub Action Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster)
 [![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2)
 [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
-[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)
+[![PySpark Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/spark)
 
 
 ## Online Documentation
diff --git a/dev/requirements.txt b/dev/requirements.txt
index 34f4b88..273294a 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -14,6 +14,9 @@ matplotlib<3.3.0
 # PySpark test dependencies
 xmlrunner
 
+# PySpark test dependencies (optional)
+coverage
+
 # Linter
 mypy
 flake8
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 507846a..4828bba 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -23,8 +23,6 @@ import os
 import re
 import sys
 import subprocess
-import glob
-import shutil
 
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
 from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
@@ -522,54 +520,6 @@ def run_python_tests(test_modules, parallelism, with_coverage=False):
             x for x in ["python3.6", "python3.9", "pypy3"] if which(x)))
     run_cmd(command)
 
-    if with_coverage:
-        post_python_tests_results()
-
-
-def post_python_tests_results():
-    if "SPARK_TEST_KEY" not in os.environ:
-        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
-              "PySpark coverage results.")
-        sys.exit(1)
-    spark_test_key = os.environ.get("SPARK_TEST_KEY")
-    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
-    # 1. Clone PySpark coverage site.
-    run_cmd([
-        "git",
-        "clone",
-        "https://spark-test:%s@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
-    # 2. Remove existing HTMLs.
-    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
-    # 3. Copy generated coverage HTMLs.
-    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
-        shutil.copy(f, "pyspark-coverage-site/")
-    os.chdir("pyspark-coverage-site")
-    try:
-        # 4. Check out to a temporary branch.
-        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
-        # 5. Add all the files.
-        run_cmd(["git", "add", "-A"])
-        # 6. Commit current HTMLs.
-        run_cmd([
-            "git",
-            "-c",
-            "user.name='Apache Spark Test Account'",
-            "-c",
-            "user.email='sparktestacc@gmail.com'",
-            "commit",
-            "-am",
-            "Coverage report at latest commit in Apache Spark"])
-        # 7. Delete the old branch.
-        run_cmd(["git", "branch", "-D", "gh-pages"])
-        # 8. Rename the temporary branch to master.
-        run_cmd(["git", "branch", "-m", "gh-pages"])
-        # 9. Finally, force update to our repository.
-        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
-    finally:
-        os.chdir("..")
-        # 10. Remove the cloned repository.
-        shutil.rmtree("pyspark-coverage-site")
-
 
 def run_python_packaging_tests():
     set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
@@ -815,11 +765,10 @@ def main():
 
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
-        # We only run PySpark tests with coverage report in one specific job with
-        # Spark master with SBT in Jenkins.
-        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
         run_python_tests(
-            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
+            modules_with_python_tests,
+            opts.parallelism,
+            with_coverage=os.environ.get("PYSPARK_CODECOV", "false") == "true")
         run_python_packaging_tests()
     if any(m.should_run_r_tests for m in test_modules):
         run_sparkr_tests()
diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py
index ba0c86f..cee53b6 100644
--- a/python/pyspark/mllib/tests/test_streaming_algorithms.py
+++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 
+import os
 import unittest
 
 from numpy import array, random, exp, dot, all, mean, abs
@@ -117,7 +118,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
             self.assertTrue(all(finalModel.centers == array(initCenters)))
             self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
             return True
-        eventually(condition, catch_assertions=True)
+        eventually(condition, 90, catch_assertions=True)
 
     def test_predictOn_model(self):
         """Test that the model predicts correctly on toy data."""
@@ -251,7 +252,7 @@ class StreamingLogisticRegressionWithSGDTests(MLLibStreamingTestCase):
             return True
 
         # We want all batches to finish for this test.
-        eventually(condition, 60.0, catch_assertions=True)
+        eventually(condition, 120, catch_assertions=True)
 
         t_models = array(models)
         diff = t_models[1:] - t_models[:-1]
@@ -292,6 +293,10 @@ class StreamingLogisticRegressionWithSGDTests(MLLibStreamingTestCase):
             self.assertTrue(
                 self.calculate_accuracy_error(true, predicted) < 0.4)
 
+    @unittest.skipIf(
+        "COVERAGE_PROCESS_START" in os.environ,
+        "Flaky with coverage enabled, skipping for now."
+    )
     def test_training_and_prediction(self):
         """Test that the model improves on toy data with no. of batches"""
         input_batches = [
@@ -428,6 +433,10 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase):
             true, predicted = zip(*batch)
             self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)
 
+    @unittest.skipIf(
+        "COVERAGE_PROCESS_START" in os.environ,
+        "Flaky with coverage enabled, skipping for now."
+    )
     def test_train_prediction(self):
         """Test that error on test data improves as model is trained."""
         slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index 596053a..5419ebb 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -31,8 +31,7 @@ from pyspark.testing.streamingutils import PySparkStreamingTestCase
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class BasicOperationTests(PySparkStreamingTestCase):
 
     def test_map(self):
@@ -396,8 +395,7 @@ class BasicOperationTests(PySparkStreamingTestCase):
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class WindowFunctionTests(PySparkStreamingTestCase):
 
     timeout = 15
@@ -477,8 +475,7 @@ class WindowFunctionTests(PySparkStreamingTestCase):
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class CheckpointTests(unittest.TestCase):
 
     setupCalled = False
diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py
index 5015592..4611d03 100644
--- a/python/pyspark/tests/test_context.py
+++ b/python/pyspark/tests/test_context.py
@@ -230,7 +230,7 @@ class ContextTests(unittest.TestCase):
             t.daemon = True
             t.start()
             # wait for scheduler to start
-            time.sleep(1)
+            time.sleep(3)
 
             tracker = sc.statusTracker()
             jobIds = tracker.getJobIdsForGroup('test_progress_api')
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
index a77d38e..ebf1e93 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -57,13 +57,22 @@ class WorkerTests(ReusedPySparkTestCase):
         t.start()
 
         daemon_pid, worker_pid = 0, 0
+        cnt = 0
         while True:
             if os.path.exists(path):
                 with open(path) as f:
                     data = f.read().split(' ')
-                daemon_pid, worker_pid = map(int, data)
-                break
-            time.sleep(0.1)
+                try:
+                    daemon_pid, worker_pid = map(int, data)
+                except ValueError:
+                    pass
+                    # In case the value is not written yet.
+                    cnt += 1
+                    if cnt == 10:
+                        raise
+                else:
+                    break
+            time.sleep(1)
 
         # cancel jobs
         self.sc.cancelAllJobs()
@@ -226,6 +235,10 @@ class WorkerSegfaultTest(ReusedPySparkTestCase):
             self.assertRegex(str(e), "Segmentation fault")
 
 
+@unittest.skipIf(
+    "COVERAGE_PROCESS_START" in os.environ,
+    "Flaky with coverage enabled, skipping for now."
+)
 class WorkerSegfaultNonDaemonTest(WorkerSegfaultTest):
 
     @classmethod
diff --git a/python/run-tests-with-coverage b/python/run-tests-with-coverage
index 3bc61e5..59409c0 100755
--- a/python/run-tests-with-coverage
+++ b/python/run-tests-with-coverage
@@ -59,6 +59,8 @@ unset COVERAGE_PROCESS_START
 find $COVERAGE_DIR/coverage_data -size 0 -print0 | xargs -0 rm -fr
 echo "Combining collected coverage data under $COVERAGE_DIR/coverage_data"
 $COV_EXEC combine
+echo "Creating XML report file at python/coverage.xml"
+$COV_EXEC xml --ignore-errors --include "pyspark/*"
 echo "Reporting the coverage data at $COVERAGE_DIR/coverage_data/coverage"
 $COV_EXEC report --include "pyspark/*"
 echo "Generating HTML files for PySpark coverage under $COVERAGE_DIR/htmlcov"
diff --git a/python/test_coverage/coverage_daemon.py b/python/test_coverage/coverage_daemon.py
index c87366a..4372135 100644
--- a/python/test_coverage/coverage_daemon.py
+++ b/python/test_coverage/coverage_daemon.py
@@ -17,6 +17,7 @@
 
 import os
 import imp
+import platform
 
 
 # This is a hack to always refer the main code rather than built zip.
@@ -24,19 +25,21 @@ main_code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 daemon = imp.load_source("daemon", "%s/pyspark/daemon.py" % main_code_dir)
 
 if "COVERAGE_PROCESS_START" in os.environ:
-    worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir)
+    # PyPy with coverage makes the tests flaky, and CPython is enough for coverage report.
+    if "pypy" not in platform.python_implementation().lower():
+        worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir)
 
-    def _cov_wrapped(*args, **kwargs):
-        import coverage
-        cov = coverage.coverage(
-            config_file=os.environ["COVERAGE_PROCESS_START"])
-        cov.start()
-        try:
-            worker.main(*args, **kwargs)
-        finally:
-            cov.stop()
-            cov.save()
-    daemon.worker_main = _cov_wrapped
+        def _cov_wrapped(*args, **kwargs):
+            import coverage
+            cov = coverage.coverage(
+                config_file=os.environ["COVERAGE_PROCESS_START"])
+            cov.start()
+            try:
+                worker.main(*args, **kwargs)
+            finally:
+                cov.stop()
+                cov.save()
+        daemon.worker_main = _cov_wrapped
 else:
     raise RuntimeError("COVERAGE_PROCESS_START environment variable is not set, exiting.")
 
diff --git a/python/test_coverage/sitecustomize.py b/python/test_coverage/sitecustomize.py
index 630237a..1f31860 100644
--- a/python/test_coverage/sitecustomize.py
+++ b/python/test_coverage/sitecustomize.py
@@ -19,5 +19,8 @@
 # If this module is defined, it's executed when the Python session begins.
 # `coverage.process_startup()` seeks if COVERAGE_PROCESS_START environment
 # variable is set or not. If set, it starts to run the coverage.
-import coverage
-coverage.process_startup()
+try:
+    import coverage
+    coverage.process_startup()
+except ImportError:
+    pass

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org