Posted to commits@spark.apache.org by do...@apache.org on 2022/06/22 15:35:18 UTC

[spark] branch master updated: [SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 59eee98024d [SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
59eee98024d is described below

commit 59eee98024dac42309f2e7196c7e68832317f284
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Wed Jun 22 08:34:57 2022 -0700

    [SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
    
    ### What changes were proposed in this pull request?
    
    This PR borrows the idea from https://github.com/apache/spark/pull/36928 but adds more changes so that scheduled jobs also share the `precondition` job, consolidating all conditional job-selection logic in one place.
    
    This PR also adds a new option to `is-changed.py` so that dependent modules can be checked together. This way, `build_and_test.yml` does not have to change every time a new module is added; the precondition job checks everything, as sketched below.
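
    For illustration, the consolidated `precondition` job now computes the whole job selection in one place, roughly as in the sketch below (simplified from the diff in this commit; checkout and setup steps are omitted, and the exact `is-changed.py` option that resolves dependent modules is not visible in this diff):

    ```yaml
    # Sketch only: a trimmed-down version of the precondition job in build_and_test.yml.
    precondition:
      name: Check changes
      runs-on: ubuntu-20.04
      outputs:
        required: ${{ steps.set-outputs.outputs.required }}
      steps:
      - name: Check all modules
        id: set-outputs
        run: |
          # Each check is assumed to cover the named module plus its dependent modules.
          sparkr=`./dev/is-changed.py -m sparkr`
          tpcds=`./dev/is-changed.py -m sql`
          docker=`./dev/is-changed.py -m docker-integration-tests`
          # Emit a single JSON map; downstream jobs read it via fromJson(needs.precondition.outputs.required).
          echo "::set-output name=required::{\"build\": \"true\", \"sparkr\": \"$sparkr\", \"tpcds-1g\": \"$tpcds\", \"docker-integration-tests\": \"$docker\"}"
    ```

    Downstream jobs then gate on this single output, e.g. `if: fromJson(needs.precondition.outputs.required).sparkr == 'true'`.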
    
    In addition, this PR removes the `type` input because the `precondition` job now replaces it.
    
    Lastly, this PR enables PySpark, SparkR, TPC-DS, and Docker integration tests for scheduled jobs when applicable.
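
    As a result, a scheduled caller workflow selects jobs by passing a `jobs` JSON map instead of the old `type` input, roughly as sketched below (the job name and `uses:` path are illustrative; the `with:` values mirror the caller workflow changes in this diff):

    ```yaml
    # Sketch only: a scheduled caller workflow opting into specific jobs.
    # The keys in 'jobs' must match the job names defined in build_and_test.yml.
    jobs:
      run-build:
        uses: ./.github/workflows/build_and_test.yml
        with:
          java: 8
          branch: master
          hadoop: hadoop3
          jobs: >-
            {
              "build": "true",
              "pyspark": "true",
              "sparkr": "true",
              "tpcds-1g": "true",
              "docker-integration-tests": "true"
            }
    ```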
    
    Closes #36928
    
    ### Why are the changes needed?
    
    To make the CI job selection logic easier to read.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, dev-only.
    
    ### How was this patch tested?
    
    Tested locally and in my fork (https://github.com/HyukjinKwon/spark/actions).
    
    Closes #36940 from HyukjinKwon/SPARK-39529.
    
    Lead-authored-by: Hyukjin Kwon <gu...@apache.org>
    Co-authored-by: Enrico Minack <gi...@enrico.minack.dev>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .github/workflows/build_and_test.yml | 93 ++++++++++++++++++++----------------
 .github/workflows/build_ansi.yml     |  9 +++-
 .github/workflows/build_branch32.yml | 10 +++-
 .github/workflows/build_branch33.yml | 10 +++-
 .github/workflows/build_coverage.yml |  5 +-
 .github/workflows/build_hadoop2.yml  |  9 +++-
 .github/workflows/build_java11.yml   |  9 +++-
 .github/workflows/build_java17.yml   |  9 +++-
 .github/workflows/build_scala213.yml | 10 +++-
 9 files changed, 116 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b1e17318e79..ff1a47c256b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -27,21 +27,27 @@ on:
         type: string
         default: 8
       branch:
+        description: Branch to run the build against
         required: false
         type: string
         default: master
       hadoop:
+        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
         required: false
         type: string
         default: hadoop3
-      type:
+      envs:
+        description: Additional environment variables to set when running the tests. Should be in JSON format.
         required: false
         type: string
-        default: regular
-      envs:
+        default: '{}'
+      jobs:
+        description: >-
+          Jobs to run, and should be in JSON format. The values should be matched with the job's key defined
+          in this file, e.g., build. See precondition job below.
         required: false
         type: string
-        default: "{}"
+        default: ''
 jobs:
   precondition:
     name: Check changes
@@ -67,27 +73,47 @@ jobs:
     - name: Check all modules
       id: set-outputs
       run: |
-        # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
-        build=true; pyspark=true; sparkr=true; tpcds=true; docker=true;
-        if [ -f "./dev/is-changed.py" ]; then
-          build=`./dev/is-changed.py -m avro,build,catalyst,core,docker-integration-tests,examples,graphx,hadoop-cloud,hive,hive-thriftserver,kubernetes,kvstore,launcher,mesos,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,spark-ganglia-lgpl,sparkr,sql,sql-kafka-0-10,streaming,streaming-kafka-0-10,streaming-kinesis-asl,tags,unsafe,yarn`
-          pyspark=`./dev/is-changed.py -m avro,build,catalyst,core,graphx,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,sql,tags,unsafe`
-          sparkr=`./dev/is-changed.py -m avro,build,catalyst,core,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,repl,sketch,sparkr,sql,tags,unsafe`
-          tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
-          docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
+        if [ -z "${{ inputs.jobs }}" ]; then
+          # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
+          pyspark=true; sparkr=true; tpcds=true; docker=true;
+          if [ -f "./dev/is-changed.py" ]; then
+            pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
+            pyspark=`./dev/is-changed.py -m $pyspark_modules`
+            sparkr=`./dev/is-changed.py -m sparkr`
+            tpcds=`./dev/is-changed.py -m sql`
+            docker=`./dev/is-changed.py -m docker-integration-tests`
+          fi
+          # 'build', 'scala-213', and 'java-11-17' are always true for now.
+          # It does not save significant time and most of PRs trigger the build.
+          precondition="
+            {
+              \"build\": \"true\",
+              \"pyspark\": \"$pyspark\",
+              \"sparkr\": \"$sparkr\",
+              \"tpcds-1g\": \"$tpcds\",
+              \"docker-integration-tests\": \"$docker\",
+              \"scala-213\": \"true\",
+              \"java-11-17\": \"true\",
+              \"lint\" : \"true\"
+            }"
+          echo $precondition # For debugging
+          # GitHub Actions set-output doesn't take newlines
+          # https://github.community/t/set-output-truncates-multiline-strings/16852/3
+          precondition="${precondition//$'\n'/'%0A'}"
+          echo "::set-output name=required::$precondition"
+        else
+          # This is usually set by scheduled jobs.
+          precondition='${{ inputs.jobs }}'
+          echo $precondition # For debugging
+          precondition="${precondition//$'\n'/'%0A'}"
+          echo "::set-output name=required::$precondition"
         fi
-        echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json
-        cat required.json
-        echo "::set-output name=required::$(cat required.json)"
 
   # Build: build Spark and run the tests for specified modules.
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
     needs: precondition
-    # Run scheduled jobs for Apache Spark only
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
-    if: >-
-      inputs.type == 'scheduled' || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true')
+    if: fromJson(needs.precondition.outputs.required).build == 'true'
     # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
     runs-on: ubuntu-20.04
     strategy:
@@ -227,13 +253,7 @@ jobs:
 
   pyspark:
     needs: precondition
-    # Run PySpark coverage scheduled jobs for Apache Spark only
-    # Run scheduled jobs with JDK 17 in Apache Spark
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if pyspark changes exist
-    if: >-
-      inputs.type == 'pyspark-coverage-scheduled'
-      || (inputs.type == 'scheduled' && inputs.java == '17')
-      || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true')
+    if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-20.04
     container:
@@ -335,11 +355,7 @@ jobs:
 
   sparkr:
     needs: precondition
-    # Run scheduled jobs with JDK 17 in Apache Spark
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if sparkr changes exist
-    if: >-
-      (inputs.type == 'scheduled' && inputs.java == '17')
-      || (inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true')
+    if: fromJson(needs.precondition.outputs.required).sparkr == 'true'
     name: "Build modules: sparkr"
     runs-on: ubuntu-20.04
     container:
@@ -405,7 +421,8 @@ jobs:
 
   # Static analysis, and documentation build
   lint:
-    if: inputs.type == 'regular'
+    needs: precondition
+    if: fromJson(needs.precondition.outputs.required).lint == 'true'
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
     env:
@@ -520,8 +537,7 @@ jobs:
 
   java-11-17:
     needs: precondition
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
-    if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
+    if: fromJson(needs.precondition.outputs.required).java-11-17 == 'true'
     name: Java ${{ matrix.java }} build with Maven
     strategy:
       fail-fast: false
@@ -576,8 +592,7 @@ jobs:
 
   scala-213:
     needs: precondition
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
-    if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
+    if: fromJson(needs.precondition.outputs.required).scala-213 == 'true'
     name: Scala 2.13 build with SBT
     runs-on: ubuntu-20.04
     steps:
@@ -622,8 +637,7 @@ jobs:
 
   tpcds-1g:
     needs: precondition
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if tpcds changes exist
-    if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true'
+    if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
     name: Run TPC-DS queries with SF=1
     runs-on: ubuntu-20.04
     env:
@@ -720,8 +734,7 @@ jobs:
 
   docker-integration-tests:
     needs: precondition
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if docker changes exist
-    if: inputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true'
+    if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
     name: Run Docker integration tests
     runs-on: ubuntu-20.04
     env:
diff --git a/.github/workflows/build_ansi.yml b/.github/workflows/build_ansi.yml
index f5c75d3fb7b..bcf02c03fed 100644
--- a/.github/workflows/build_ansi.yml
+++ b/.github/workflows/build_ansi.yml
@@ -32,8 +32,15 @@ jobs:
       java: 8
       branch: master
       hadoop: hadoop3
-      type: scheduled
       envs: >-
         {
           "SPARK_ANSI_SQL_MODE": "true",
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true"
+        }
diff --git a/.github/workflows/build_branch32.yml b/.github/workflows/build_branch32.yml
index 12e84068d72..527e5b238a5 100644
--- a/.github/workflows/build_branch32.yml
+++ b/.github/workflows/build_branch32.yml
@@ -32,8 +32,16 @@ jobs:
       java: 8
       branch: branch-3.2
       hadoop: hadoop3.2
-      type: scheduled
       envs: >-
         {
           "SCALA_PROFILE": "scala2.13"
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true",
+          "lint" : "true"
+        }
diff --git a/.github/workflows/build_branch33.yml b/.github/workflows/build_branch33.yml
index 1d6c5f56287..e8da76b4d89 100644
--- a/.github/workflows/build_branch33.yml
+++ b/.github/workflows/build_branch33.yml
@@ -32,8 +32,16 @@ jobs:
       java: 8
       branch: branch-3.3
       hadoop: hadoop3
-      type: scheduled
       envs: >-
         {
           "SCALA_PROFILE": "scala2.13"
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true",
+          "lint" : "true"
+        }
diff --git a/.github/workflows/build_coverage.yml b/.github/workflows/build_coverage.yml
index 8a9a7f45c14..507bdf9ada6 100644
--- a/.github/workflows/build_coverage.yml
+++ b/.github/workflows/build_coverage.yml
@@ -32,8 +32,11 @@ jobs:
       java: 8
       branch: master
       hadoop: hadoop3
-      type: pyspark-coverage-scheduled
       envs: >-
         {
           "PYSPARK_CODECOV": "true"
         }
+      jobs: >-
+        {
+          "pyspark": "true"
+        }
diff --git a/.github/workflows/build_hadoop2.yml b/.github/workflows/build_hadoop2.yml
index c15c43e17bc..13a91c1d687 100644
--- a/.github/workflows/build_hadoop2.yml
+++ b/.github/workflows/build_hadoop2.yml
@@ -32,4 +32,11 @@ jobs:
       java: 8
       branch: master
       hadoop: hadoop2
-      type: scheduled
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true"
+        }
diff --git a/.github/workflows/build_java11.yml b/.github/workflows/build_java11.yml
index dfe5884f968..938e2f49a8b 100644
--- a/.github/workflows/build_java11.yml
+++ b/.github/workflows/build_java11.yml
@@ -32,9 +32,16 @@ jobs:
       java: 11
       branch: master
       hadoop: hadoop3
-      type: scheduled
       envs: >-
         {
           "SKIP_MIMA": "true",
           "SKIP_UNIDOC": "true"
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true"
+        }
diff --git a/.github/workflows/build_java17.yml b/.github/workflows/build_java17.yml
index 4a973ca3991..c1cd85f2b98 100644
--- a/.github/workflows/build_java17.yml
+++ b/.github/workflows/build_java17.yml
@@ -32,9 +32,16 @@ jobs:
       java: 17
       branch: master
       hadoop: hadoop3
-      type: scheduled
       envs: >-
         {
           "SKIP_MIMA": "true",
           "SKIP_UNIDOC": "true"
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true"
+        }
diff --git a/.github/workflows/build_scala213.yml b/.github/workflows/build_scala213.yml
index 0bc0eabd534..35625ed7673 100644
--- a/.github/workflows/build_scala213.yml
+++ b/.github/workflows/build_scala213.yml
@@ -32,8 +32,16 @@ jobs:
       java: 8
       branch: master
       hadoop: hadoop3
-      type: scheduled
       envs: >-
         {
           "SCALA_PROFILE": "scala2.13"
         }
+      jobs: >-
+        {
+          "build": "true",
+          "pyspark": "true",
+          "sparkr": "true",
+          "tpcds-1g": "true",
+          "docker-integration-tests": "true",
+          "lint" : "true"
+        }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org