Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2020/07/09 14:53:02 UTC

[GitHub] [spark] dongjoon-hyun commented on a change in pull request #29057: [SPARK-32245][INFRA] Run Spark tests in Github Actions

dongjoon-hyun commented on a change in pull request #29057:
URL: https://github.com/apache/spark/pull/29057#discussion_r452278013



##########
File path: .github/workflows/master.yml
##########
@@ -9,148 +9,233 @@ on:
     - master
 
 jobs:
+  # TODO(SPARK-32248): Recover JDK 11 builds
+  # Build: build Spark and run the tests for the specified modules.
   build:
-
+    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        java: [ '1.8', '11' ]
-        hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ]
-        hive: [ 'hive-1.2', 'hive-2.3' ]
-        exclude:
-        - java: '11'
-          hive: 'hive-1.2'
-        - hadoop: 'hadoop-3.2'
-          hive: 'hive-1.2'
-    name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }}
-
+        java:
+          - 1.8
+        hadoop:
+          - hadoop3.2
+        hive:
+          - hive2.3
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depend on the external Amazon Kinesis service.
+        # Note that the modules below are from sparktestsupport/modules.py.
+        modules:
+          - |-
+            core, unsafe, kvstore, avro,
+            network_common, network_shuffle, repl, launcher,
+            examples, sketch, graphx
+          - |-
+            catalyst, sql
+          - |-
+            hive-thriftserver
+          - |-
+            streaming, sql-kafka-0-10, streaming-kafka-0-10
+          - |-
+            mllib-local, mllib
+          - |-
+            pyspark-sql, pyspark-mllib, pyspark-resource
+          - |-
+            pyspark-core, pyspark-streaming, pyspark-ml
+          - |-
+            sparkr
+          - |-
+            yarn, mesos, kubernetes, hadoop-cloud,
+            spark-ganglia-lgpl
+        # Here, we split Hive tests into the heavy ones and the rest.
+        included-tags: [""]
+        excluded-tags: [""]
+        comment: ["- all tests"]
+        include:
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.HeavyHiveTest
+            comment: "- heavy tests"
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.HeavyHiveTest
+            comment: "- light tests"
+    env:
+      TEST_ONLY_MODULES: ${{ matrix.modules }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      # GitHub Actions' default miniconda
+      CONDA_PREFIX: /usr/share/miniconda
+      # Don't run the tests in parallel due to flakiness. See SparkParallelTestGrouping.
+      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
     steps:
-    - uses: actions/checkout@master
-    # We split caches because GitHub Action Cache has a 400MB-size limit.
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v1
       with:
         path: build
         key: build-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           build-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/com
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-
-    - uses: actions/cache@v1
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/org
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/net
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }}
+        path: ~/.m2/repository
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-
-    - uses: actions/cache@v1
+          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/io
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }}
+        path: ~/.ivy2/cache
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-
-    - name: Set up JDK ${{ matrix.java }}
+          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
+    - name: Install JDK ${{ matrix.java }}
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
-    - name: Build with Maven
+    # PySpark
+    - name: Install PyPy3
+      # The SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
+      # Note that the order of the Python installations here matters because the
+      # default python3 is overridden by pypy3.
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql'
+      with:
+        python-version: pypy3
+        architecture: x64
+    - name: Install Python 2.7
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql'
+      with:
+        python-version: 2.7
+        architecture: x64
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql'
+      with:
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python packages
+      if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql'
+      # PyArrow is not supported in PyPy yet, see ARROW-2651.
+      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
+      run: |
+        python3 -m pip install numpy pyarrow pandas scipy
+        python3 -m pip list
+        python2 -m pip install numpy pyarrow pandas scipy

Review comment:
       Oh, do we need `Python2`?
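
For context on the step this comment is anchored to: the workflow installs three interpreters (PyPy3, Python 2.7, Python 3.6) and then installs numpy, pyarrow, pandas and scipy for `python3` and `python2`, but not for PyPy. Below is a minimal, illustrative guard for that gap; the helper `have_pyarrow` is a hypothetical name, not part of Spark's code, and the sketch mirrors the intent of the ARROW-2651 note in the diff (PyArrow does not support PyPy), not Spark's actual test-support logic.

```python
import platform


def have_pyarrow():
    """Illustrative check: is PyArrow usable on this interpreter?"""
    # PyArrow wheels are not published for PyPy (see the ARROW-2651 note
    # in the diff), so Arrow-dependent tests must be skipped there.
    if platform.python_implementation() == "PyPy":
        return False
    try:
        import pyarrow  # noqa: F401
        return True
    except ImportError:
        return False


if not have_pyarrow():
    print("Skipping Arrow-based tests on this interpreter.")
```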

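Relatedly, the "order of Python installations matters" note in the diff can be sanity-checked with a tiny script run as `python3` after the three setup steps; this is a hedged illustration, not part of the PR:

```python
import platform
import sys

# After the three setup-python steps above, `python3` should resolve to
# CPython 3.6, not PyPy3; if the PyPy3 step ran last instead of first,
# the implementation printed here would be "PyPy" rather than "CPython".
print("implementation:", platform.python_implementation())
print("version:", platform.python_version())
print("executable:", sys.executable)
```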



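More broadly, the matrix plumbs into the build through the `TEST_ONLY_MODULES`, `TEST_ONLY_INCLUDED_TAGS` and `TEST_ONLY_EXCLUDED_TAGS` environment variables defined in the `env:` block. Here is a minimal sketch of how a test launcher could consume them; the variable names come from the workflow, while the parsing itself is illustrative rather than Spark's actual `dev/run-tests.py`:

```python
import os


def parse_csv_env(name):
    """Split a comma-separated env var into a clean list of tokens."""
    raw = os.environ.get(name, "")
    # The workflow passes multi-line YAML strings, so strip whitespace
    # (including newlines) around each token and drop empty entries; this
    # is also why every module name in the matrix lists needs a trailing
    # comma as a separator.
    return [token.strip() for token in raw.split(",") if token.strip()]


modules = parse_csv_env("TEST_ONLY_MODULES")
included_tags = parse_csv_env("TEST_ONLY_INCLUDED_TAGS")
excluded_tags = parse_csv_env("TEST_ONLY_EXCLUDED_TAGS")

print("modules:", modules)
print("included tags:", included_tags)
print("excluded tags:", excluded_tags)
```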