You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2022/06/22 15:35:18 UTC
[spark] branch master updated: [SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 59eee98024d [SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
59eee98024d is described below
commit 59eee98024dac42309f2e7196c7e68832317f284
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Wed Jun 22 08:34:57 2022 -0700
[SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
### What changes were proposed in this pull request?
This PR borrows the idea from https://github.com/apache/spark/pull/36928 but adds some more changes in order for scheduled jobs to share the `precondition` so all conditional logic is consolidated here.
This PR also adds a new option to `is-changed.py` so dependent modules can be checked together. In this way, we don't have to change `build_and_test.yml` often when we add a new module.
In addition, this PR removes `type` because `precondition` job now replaces it.
Lastly, this PR enables PySpark, SparkR, TPC-DS, and Docker integration tests for scheduled jobs when applicable.
Closes #36928
### Why are the changes needed?
To make it easier to read.
### Does this PR introduce _any_ user-facing change?
No, dev-only.
### How was this patch tested?
Tested locally and in my fork (https://github.com/HyukjinKwon/spark/actions)
Closes #36940 from HyukjinKwon/SPARK-39529.
Lead-authored-by: Hyukjin Kwon <gu...@apache.org>
Co-authored-by: Enrico Minack <gi...@enrico.minack.dev>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.github/workflows/build_and_test.yml | 93 ++++++++++++++++++++----------------
.github/workflows/build_ansi.yml | 9 +++-
.github/workflows/build_branch32.yml | 10 +++-
.github/workflows/build_branch33.yml | 10 +++-
.github/workflows/build_coverage.yml | 5 +-
.github/workflows/build_hadoop2.yml | 9 +++-
.github/workflows/build_java11.yml | 9 +++-
.github/workflows/build_java17.yml | 9 +++-
.github/workflows/build_scala213.yml | 10 +++-
9 files changed, 116 insertions(+), 48 deletions(-)
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b1e17318e79..ff1a47c256b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -27,21 +27,27 @@ on:
type: string
default: 8
branch:
+ description: Branch to run the build against
required: false
type: string
default: master
hadoop:
+ description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
required: false
type: string
default: hadoop3
- type:
+ envs:
+ description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
type: string
- default: regular
- envs:
+ default: '{}'
+ jobs:
+ description: >-
+ Jobs to run, and should be in JSON format. The values should be matched with the job's key defined
+ in this file, e.g., build. See precondition job below.
required: false
type: string
- default: "{}"
+ default: ''
jobs:
precondition:
name: Check changes
@@ -67,27 +73,47 @@ jobs:
- name: Check all modules
id: set-outputs
run: |
- # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
- build=true; pyspark=true; sparkr=true; tpcds=true; docker=true;
- if [ -f "./dev/is-changed.py" ]; then
- build=`./dev/is-changed.py -m avro,build,catalyst,core,docker-integration-tests,examples,graphx,hadoop-cloud,hive,hive-thriftserver,kubernetes,kvstore,launcher,mesos,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,spark-ganglia-lgpl,sparkr,sql,sql-kafka-0-10,streaming,streaming-kafka-0-10,streaming-kinesis-asl,tags,unsafe,yarn`
- pyspark=`./dev/is-changed.py -m avro,build,catalyst,core,graphx,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,sql,tags,unsafe`
- sparkr=`./dev/is-changed.py -m avro,build,catalyst,core,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,repl,sketch,sparkr,sql,tags,unsafe`
- tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
- docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
+ if [ -z "${{ inputs.jobs }}" ]; then
+ # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
+ pyspark=true; sparkr=true; tpcds=true; docker=true;
+ if [ -f "./dev/is-changed.py" ]; then
+ pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
+ pyspark=`./dev/is-changed.py -m $pyspark_modules`
+ sparkr=`./dev/is-changed.py -m sparkr`
+ tpcds=`./dev/is-changed.py -m sql`
+ docker=`./dev/is-changed.py -m docker-integration-tests`
+ fi
+ # 'build', 'scala-213', and 'java-11-17' are always true for now.
+ # It does not save significant time and most of PRs trigger the build.
+ precondition="
+ {
+ \"build\": \"true\",
+ \"pyspark\": \"$pyspark\",
+ \"sparkr\": \"$sparkr\",
+ \"tpcds-1g\": \"$tpcds\",
+ \"docker-integration-tests\": \"$docker\",
+ \"scala-213\": \"true\",
+ \"java-11-17\": \"true\",
+ \"lint\" : \"true\"
+ }"
+ echo $precondition # For debugging
+ # GitHub Actions set-output doesn't take newlines
+ # https://github.community/t/set-output-truncates-multiline-strings/16852/3
+ precondition="${precondition//$'\n'/'%0A'}"
+ echo "::set-output name=required::$precondition"
+ else
+ # This is usually set by scheduled jobs.
+ precondition='${{ inputs.jobs }}'
+ echo $precondition # For debugging
+ precondition="${precondition//$'\n'/'%0A'}"
+ echo "::set-output name=required::$precondition"
fi
- echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json
- cat required.json
- echo "::set-output name=required::$(cat required.json)"
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
needs: precondition
- # Run scheduled jobs for Apache Spark only
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
- if: >-
- inputs.type == 'scheduled' || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true')
+ if: fromJson(needs.precondition.outputs.required).build == 'true'
# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
runs-on: ubuntu-20.04
strategy:
@@ -227,13 +253,7 @@ jobs:
pyspark:
needs: precondition
- # Run PySpark coverage scheduled jobs for Apache Spark only
- # Run scheduled jobs with JDK 17 in Apache Spark
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if pyspark changes exist
- if: >-
- inputs.type == 'pyspark-coverage-scheduled'
- || (inputs.type == 'scheduled' && inputs.java == '17')
- || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true')
+ if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
@@ -335,11 +355,7 @@ jobs:
sparkr:
needs: precondition
- # Run scheduled jobs with JDK 17 in Apache Spark
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if sparkr changes exist
- if: >-
- (inputs.type == 'scheduled' && inputs.java == '17')
- || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true')
+ if: fromJson(needs.precondition.outputs.required).sparkr == 'true'
name: "Build modules: sparkr"
runs-on: ubuntu-20.04
container:
@@ -405,7 +421,8 @@ jobs:
# Static analysis, and documentation build
lint:
- if: inputs.type == 'regular'
+ needs: precondition
+ if: fromJson(needs.precondition.outputs.required).lint == 'true'
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-20.04
env:
@@ -520,8 +537,7 @@ jobs:
java-11-17:
needs: precondition
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
- if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
+ if: fromJson(needs.precondition.outputs.required).java-11-17 == 'true'
name: Java ${{ matrix.java }} build with Maven
strategy:
fail-fast: false
@@ -576,8 +592,7 @@ jobs:
scala-213:
needs: precondition
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
- if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
+ if: fromJson(needs.precondition.outputs.required).scala-213 == 'true'
name: Scala 2.13 build with SBT
runs-on: ubuntu-20.04
steps:
@@ -622,8 +637,7 @@ jobs:
tpcds-1g:
needs: precondition
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if tpcds changes exist
- if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true'
+ if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
name: Run TPC-DS queries with SF=1
runs-on: ubuntu-20.04
env:
@@ -720,8 +734,7 @@ jobs:
docker-integration-tests:
needs: precondition
- # Run regular jobs for commit in both Apache Spark and forked repository, but only if docker changes exist
- if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true'
+ if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
name: Run Docker integration tests
runs-on: ubuntu-20.04
env:
diff --git a/.github/workflows/build_ansi.yml b/.github/workflows/build_ansi.yml
index f5c75d3fb7b..bcf02c03fed 100644
--- a/.github/workflows/build_ansi.yml
+++ b/.github/workflows/build_ansi.yml
@@ -32,8 +32,15 @@ jobs:
java: 8
branch: master
hadoop: hadoop3
- type: scheduled
envs: >-
{
"SPARK_ANSI_SQL_MODE": "true",
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true"
+ }
diff --git a/.github/workflows/build_branch32.yml b/.github/workflows/build_branch32.yml
index 12e84068d72..527e5b238a5 100644
--- a/.github/workflows/build_branch32.yml
+++ b/.github/workflows/build_branch32.yml
@@ -32,8 +32,16 @@ jobs:
java: 8
branch: branch-3.2
hadoop: hadoop3.2
- type: scheduled
envs: >-
{
"SCALA_PROFILE": "scala2.13"
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true",
+ "lint" : "true"
+ }
diff --git a/.github/workflows/build_branch33.yml b/.github/workflows/build_branch33.yml
index 1d6c5f56287..e8da76b4d89 100644
--- a/.github/workflows/build_branch33.yml
+++ b/.github/workflows/build_branch33.yml
@@ -32,8 +32,16 @@ jobs:
java: 8
branch: branch-3.3
hadoop: hadoop3
- type: scheduled
envs: >-
{
"SCALA_PROFILE": "scala2.13"
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true",
+ "lint" : "true"
+ }
diff --git a/.github/workflows/build_coverage.yml b/.github/workflows/build_coverage.yml
index 8a9a7f45c14..507bdf9ada6 100644
--- a/.github/workflows/build_coverage.yml
+++ b/.github/workflows/build_coverage.yml
@@ -32,8 +32,11 @@ jobs:
java: 8
branch: master
hadoop: hadoop3
- type: pyspark-coverage-scheduled
envs: >-
{
"PYSPARK_CODECOV": "true"
}
+ jobs: >-
+ {
+ "pyspark": "true"
+ }
diff --git a/.github/workflows/build_hadoop2.yml b/.github/workflows/build_hadoop2.yml
index c15c43e17bc..13a91c1d687 100644
--- a/.github/workflows/build_hadoop2.yml
+++ b/.github/workflows/build_hadoop2.yml
@@ -32,4 +32,11 @@ jobs:
java: 8
branch: master
hadoop: hadoop2
- type: scheduled
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true"
+ }
diff --git a/.github/workflows/build_java11.yml b/.github/workflows/build_java11.yml
index dfe5884f968..938e2f49a8b 100644
--- a/.github/workflows/build_java11.yml
+++ b/.github/workflows/build_java11.yml
@@ -32,9 +32,16 @@ jobs:
java: 11
branch: master
hadoop: hadoop3
- type: scheduled
envs: >-
{
"SKIP_MIMA": "true",
"SKIP_UNIDOC": "true"
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true"
+ }
diff --git a/.github/workflows/build_java17.yml b/.github/workflows/build_java17.yml
index 4a973ca3991..c1cd85f2b98 100644
--- a/.github/workflows/build_java17.yml
+++ b/.github/workflows/build_java17.yml
@@ -32,9 +32,16 @@ jobs:
java: 17
branch: master
hadoop: hadoop3
- type: scheduled
envs: >-
{
"SKIP_MIMA": "true",
"SKIP_UNIDOC": "true"
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true"
+ }
diff --git a/.github/workflows/build_scala213.yml b/.github/workflows/build_scala213.yml
index 0bc0eabd534..35625ed7673 100644
--- a/.github/workflows/build_scala213.yml
+++ b/.github/workflows/build_scala213.yml
@@ -32,8 +32,16 @@ jobs:
java: 8
branch: master
hadoop: hadoop3
- type: scheduled
envs: >-
{
"SCALA_PROFILE": "scala2.13"
}
+ jobs: >-
+ {
+ "build": "true",
+ "pyspark": "true",
+ "sparkr": "true",
+ "tpcds-1g": "true",
+ "docker-integration-tests": "true",
+ "lint" : "true"
+ }
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org