Posted to commits@spark.apache.org by gu...@apache.org on 2021/08/01 12:39:41 UTC
[spark] branch branch-3.2 updated: [SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 310cd8ee [SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins
310cd8ee is described below
commit 310cd8eef1f17a320bb5334280e4aac52692a3d4
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Sun Aug 1 21:37:19 2021 +0900
[SPARK-36092][INFRA][BUILD][PYTHON] Migrate to GitHub Actions with Codecov from Jenkins
This PR proposes to migrate the PySpark coverage report from Jenkins to GitHub Actions by setting up a daily cron job.
For some background: PySpark code coverage is currently reported by this specific Jenkins job: https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
Because of a security issue between the [Codecov service](https://app.codecov.io/gh/) and the Jenkins machines, we had to work around it by manually hosting a coverage site via GitHub Pages under the spark-test account (which is shared with only a subset of PMC members); see https://spark-test.github.io/pyspark-coverage-site/.
Since we now run the build via GitHub Actions, we can leverage the [Codecov plugin](https://github.com/codecov/codecov-action) and remove that workaround.
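As an illustration of the new setup, here is a minimal sketch of a scheduled workflow that uploads the coverage XML via the Codecov action. This is a hypothetical sketch, not the exact workflow added in this commit: the workflow name, cron schedule, runner image, and action versions are assumptions.

    # Hypothetical workflow sketch; name, schedule, and versions are assumptions.
    name: pyspark-coverage-sketch

    on:
      schedule:
        - cron: '0 0 * * *'  # once a day; the real schedule may differ

    jobs:
      coverage:
        runs-on: ubuntu-20.04
        steps:
          - uses: actions/checkout@v2
          # PYSPARK_CODECOV=true flips the new switch in dev/run-tests.py and
          # makes the run produce python/coverage.xml.
          - name: Run PySpark tests with coverage
            run: PYSPARK_CODECOV=true ./dev/run-tests.py
          - name: Upload coverage report to Codecov
            uses: codecov/codecov-action@v2
            with:
              files: ./python/coverage.xml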
Does this PR introduce any user-facing change? Virtually no. The coverage site (UI) might change, but the information it holds should be virtually the same.
How was this patch tested? I manually tested:
- Scheduled run: https://github.com/HyukjinKwon/spark/actions/runs/1082261484
- Coverage report: https://codecov.io/gh/HyukjinKwon/spark/tree/73f0291a7df1eda98045cd759303aac1c2a9c929/python/pyspark
- Run against a PR: https://github.com/HyukjinKwon/spark/actions/runs/1082367175
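To reproduce the coverage run locally, a minimal sketch (assuming the optional coverage package from dev/requirements.txt is installed):

    # Enable the coverage path in the main test driver via the new switch:
    PYSPARK_CODECOV=true ./dev/run-tests.py

    # Or drive the PySpark coverage wrapper directly; it now also writes
    # python/coverage.xml in addition to the console and HTML reports:
    cd python
    ./run-tests-with-coverage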
Closes #33591 from HyukjinKwon/SPARK-36092.
Authored-by: Hyukjin Kwon <gu...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit c0d1860f256eccd09a07584b8a77e6c60cc74c46)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.gitignore | 1 +
README.md | 2 +-
dev/requirements.txt | 3 ++
dev/run-tests.py | 57 ++--------------------
.../mllib/tests/test_streaming_algorithms.py | 13 ++++-
python/pyspark/streaming/tests/test_dstream.py | 9 ++--
python/pyspark/tests/test_context.py | 2 +-
python/pyspark/tests/test_worker.py | 19 ++++++--
python/run-tests-with-coverage | 2 +
python/test_coverage/coverage_daemon.py | 27 +++++-----
python/test_coverage/sitecustomize.py | 7 ++-
11 files changed, 61 insertions(+), 81 deletions(-)
diff --git a/.gitignore b/.gitignore
index 86493a7..1a7881a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ project/plugins/src_managed/
project/plugins/target/
python/lib/pyspark.zip
python/.eggs/
+python/coverage.xml
python/deps
python/docs/_site/
python/docs/source/reference/**/api/
diff --git a/README.md b/README.md
index 7d2c457..a208497 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ and Structured Streaming for stream processing.
[![GitHub Action Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster)
[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-3.2)
[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
-[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)
+[![PySpark Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/spark)
## Online Documentation
diff --git a/dev/requirements.txt b/dev/requirements.txt
index 34f4b88..273294a 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -14,6 +14,9 @@ matplotlib<3.3.0
# PySpark test dependencies
xmlrunner
+# PySpark test dependencies (optional)
+coverage
+
# Linter
mypy
flake8
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 507846a..4828bba 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -23,8 +23,6 @@ import os
import re
import sys
import subprocess
-import glob
-import shutil
from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
@@ -522,54 +520,6 @@ def run_python_tests(test_modules, parallelism, with_coverage=False):
             x for x in ["python3.6", "python3.9", "pypy3"] if which(x)))
     run_cmd(command)
 
-    if with_coverage:
-        post_python_tests_results()
-
-
-def post_python_tests_results():
-    if "SPARK_TEST_KEY" not in os.environ:
-        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
-              "PySpark coverage results.")
-        sys.exit(1)
-    spark_test_key = os.environ.get("SPARK_TEST_KEY")
-    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
-    # 1. Clone PySpark coverage site.
-    run_cmd([
-        "git",
-        "clone",
-        "https://spark-test:%s@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
-    # 2. Remove existing HTMLs.
-    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
-    # 3. Copy generated coverage HTMLs.
-    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
-        shutil.copy(f, "pyspark-coverage-site/")
-    os.chdir("pyspark-coverage-site")
-    try:
-        # 4. Check out to a temporary branch.
-        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
-        # 5. Add all the files.
-        run_cmd(["git", "add", "-A"])
-        # 6. Commit current HTMLs.
-        run_cmd([
-            "git",
-            "-c",
-            "user.name='Apache Spark Test Account'",
-            "-c",
-            "user.email='sparktestacc@gmail.com'",
-            "commit",
-            "-am",
-            "Coverage report at latest commit in Apache Spark"])
-        # 7. Delete the old branch.
-        run_cmd(["git", "branch", "-D", "gh-pages"])
-        # 8. Rename the temporary branch to master.
-        run_cmd(["git", "branch", "-m", "gh-pages"])
-        # 9. Finally, force update to our repository.
-        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
-    finally:
-        os.chdir("..")
-        # 10. Remove the cloned repository.
-        shutil.rmtree("pyspark-coverage-site")
-
 
 def run_python_packaging_tests():
     set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
@@ -815,11 +765,10 @@ def main():
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
-        # We only run PySpark tests with coverage report in one specific job with
-        # Spark master with SBT in Jenkins.
-        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
         run_python_tests(
-            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
+            modules_with_python_tests,
+            opts.parallelism,
+            with_coverage=os.environ.get("PYSPARK_CODECOV", "false") == "true")
         run_python_packaging_tests()
 
     if any(m.should_run_r_tests for m in test_modules):
         run_sparkr_tests()
diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py
index ba0c86f..cee53b6 100644
--- a/python/pyspark/mllib/tests/test_streaming_algorithms.py
+++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 
+import os
 import unittest
 
 from numpy import array, random, exp, dot, all, mean, abs
@@ -117,7 +118,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
             self.assertTrue(all(finalModel.centers == array(initCenters)))
             self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
             return True
 
-        eventually(condition, catch_assertions=True)
+        eventually(condition, 90, catch_assertions=True)
 
     def test_predictOn_model(self):
         """Test that the model predicts correctly on toy data."""
@@ -251,7 +252,7 @@ class StreamingLogisticRegressionWithSGDTests(MLLibStreamingTestCase):
return True
# We want all batches to finish for this test.
- eventually(condition, 60.0, catch_assertions=True)
+ eventually(condition, 120, catch_assertions=True)
t_models = array(models)
diff = t_models[1:] - t_models[:-1]
@@ -292,6 +293,10 @@ class StreamingLogisticRegressionWithSGDTests(MLLibStreamingTestCase):
         self.assertTrue(
             self.calculate_accuracy_error(true, predicted) < 0.4)
 
+    @unittest.skipIf(
+        "COVERAGE_PROCESS_START" in os.environ,
+        "Flaky with coverage enabled, skipping for now."
+    )
     def test_training_and_prediction(self):
         """Test that the model improves on toy data with no. of batches"""
         input_batches = [
@@ -428,6 +433,10 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase):
             true, predicted = zip(*batch)
             self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1)
 
+    @unittest.skipIf(
+        "COVERAGE_PROCESS_START" in os.environ,
+        "Flaky with coverage enabled, skipping for now."
+    )
     def test_train_prediction(self):
         """Test that error on test data improves as model is trained."""
         slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25)
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index 596053a..5419ebb 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -31,8 +31,7 @@ from pyspark.testing.streamingutils import PySparkStreamingTestCase
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class BasicOperationTests(PySparkStreamingTestCase):
 
     def test_map(self):
@@ -396,8 +395,7 @@ class BasicOperationTests(PySparkStreamingTestCase):
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class WindowFunctionTests(PySparkStreamingTestCase):
 
     timeout = 15
@@ -477,8 +475,7 @@ class WindowFunctionTests(PySparkStreamingTestCase):
 
 @unittest.skipIf(
     "pypy" in platform.python_implementation().lower(),
-    "The tests fail in PyPy3 implementation for an unknown reason. "
-    "With PyPy, it causes to hang DStream tests forever when Coverage report is used.")
+    "The tests fail in PyPy3 implementation for an unknown reason.")
 class CheckpointTests(unittest.TestCase):
 
     setupCalled = False
diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py
index 5015592..4611d03 100644
--- a/python/pyspark/tests/test_context.py
+++ b/python/pyspark/tests/test_context.py
@@ -230,7 +230,7 @@ class ContextTests(unittest.TestCase):
         t.daemon = True
         t.start()
         # wait for scheduler to start
-        time.sleep(1)
+        time.sleep(3)
 
         tracker = sc.statusTracker()
         jobIds = tracker.getJobIdsForGroup('test_progress_api')
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
index a77d38e..ebf1e93 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -57,13 +57,22 @@ class WorkerTests(ReusedPySparkTestCase):
         t.start()
 
         daemon_pid, worker_pid = 0, 0
+        cnt = 0
         while True:
             if os.path.exists(path):
                 with open(path) as f:
                     data = f.read().split(' ')
-                    daemon_pid, worker_pid = map(int, data)
-                    break
-            time.sleep(0.1)
+                    try:
+                        daemon_pid, worker_pid = map(int, data)
+                    except ValueError:
+                        pass
+                        # In case the value is not written yet.
+                        cnt += 1
+                        if cnt == 10:
+                            raise
+                    else:
+                        break
+            time.sleep(1)
 
         # cancel jobs
         self.sc.cancelAllJobs()
@@ -226,6 +235,10 @@ class WorkerSegfaultTest(ReusedPySparkTestCase):
         self.assertRegex(str(e), "Segmentation fault")
 
 
+@unittest.skipIf(
+    "COVERAGE_PROCESS_START" in os.environ,
+    "Flaky with coverage enabled, skipping for now."
+)
 class WorkerSegfaultNonDaemonTest(WorkerSegfaultTest):
 
     @classmethod
diff --git a/python/run-tests-with-coverage b/python/run-tests-with-coverage
index 3bc61e5..59409c0 100755
--- a/python/run-tests-with-coverage
+++ b/python/run-tests-with-coverage
@@ -59,6 +59,8 @@ unset COVERAGE_PROCESS_START
find $COVERAGE_DIR/coverage_data -size 0 -print0 | xargs -0 rm -fr
echo "Combining collected coverage data under $COVERAGE_DIR/coverage_data"
$COV_EXEC combine
+echo "Creating XML report file at python/coverage.xml"
+$COV_EXEC xml --ignore-errors --include "pyspark/*"
echo "Reporting the coverage data at $COVERAGE_DIR/coverage_data/coverage"
$COV_EXEC report --include "pyspark/*"
echo "Generating HTML files for PySpark coverage under $COVERAGE_DIR/htmlcov"
diff --git a/python/test_coverage/coverage_daemon.py b/python/test_coverage/coverage_daemon.py
index c87366a..4372135 100644
--- a/python/test_coverage/coverage_daemon.py
+++ b/python/test_coverage/coverage_daemon.py
@@ -17,6 +17,7 @@
 
 import os
 import imp
+import platform
 
 
 # This is a hack to always refer the main code rather than built zip.
@@ -24,19 +25,21 @@ main_code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 daemon = imp.load_source("daemon", "%s/pyspark/daemon.py" % main_code_dir)
 
 if "COVERAGE_PROCESS_START" in os.environ:
-    worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir)
+    # PyPy with coverage makes the tests flaky, and CPython is enough for coverage report.
+    if "pypy" not in platform.python_implementation().lower():
+        worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir)
 
-    def _cov_wrapped(*args, **kwargs):
-        import coverage
-        cov = coverage.coverage(
-            config_file=os.environ["COVERAGE_PROCESS_START"])
-        cov.start()
-        try:
-            worker.main(*args, **kwargs)
-        finally:
-            cov.stop()
-            cov.save()
-    daemon.worker_main = _cov_wrapped
+        def _cov_wrapped(*args, **kwargs):
+            import coverage
+            cov = coverage.coverage(
+                config_file=os.environ["COVERAGE_PROCESS_START"])
+            cov.start()
+            try:
+                worker.main(*args, **kwargs)
+            finally:
+                cov.stop()
+                cov.save()
+        daemon.worker_main = _cov_wrapped
 else:
     raise RuntimeError("COVERAGE_PROCESS_START environment variable is not set, exiting.")
diff --git a/python/test_coverage/sitecustomize.py b/python/test_coverage/sitecustomize.py
index 630237a..1f31860 100644
--- a/python/test_coverage/sitecustomize.py
+++ b/python/test_coverage/sitecustomize.py
@@ -19,5 +19,8 @@
# If this module is defined, it's executed when the Python session begins.
# `coverage.process_startup()` seeks if COVERAGE_PROCESS_START environment
# variable is set or not. If set, it starts to run the coverage.
-import coverage
-coverage.process_startup()
+try:
+    import coverage
+    coverage.process_startup()
+except ImportError:
+    pass
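For context on how this fits together: coverage.process_startup() begins measurement only when COVERAGE_PROCESS_START points at a coverage configuration file, and a sitecustomize.py on the Python path runs at interpreter startup, so every worker subprocess records coverage automatically. A rough sketch of the wiring (paths are illustrative; the real values are set up by python/run-tests-with-coverage):

    # Illustrative paths, not the exact ones the test scripts use.
    export COVERAGE_PROCESS_START=/path/to/.coveragerc
    export PYTHONPATH=/path/to/dir-with-sitecustomize:$PYTHONPATH
    python your_script.py  # coverage data is saved when the process exits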