Posted to commits@arrow.apache.org by ko...@apache.org on 2021/02/05 00:28:44 UTC

[arrow] branch master updated: ARROW-10457: [CI] Fix Spark integration tests with branch-3.0

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9e88889  ARROW-10457: [CI] Fix Spark integration tests with branch-3.0
9e88889 is described below

commit 9e88889d5e7e2c52cc92f3a091fdd955d2370819
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Fri Feb 5 09:27:22 2021 +0900

    ARROW-10457: [CI] Fix Spark integration tests with branch-3.0
    
    This adds an additional docker-compose task that tests a Spark branch with the latest PyArrow, instead of the default task, which also builds Spark with the latest Arrow Java. This lets existing Spark releases with pinned Arrow Java dependencies still be tested against current PyArrow changes, to ensure compatibility is maintained. The nightly test against Spark branch-3.0 is changed to use this new task; a usage sketch of the new mode follows the diffstat below.
    
    Closes #9210 from BryanCutler/ci-spark-branch-3.0-ARROW-10457
    
    Authored-by: Bryan Cutler <cu...@gmail.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 ci/docker/conda-python-spark.dockerfile |  6 ++---
 ci/scripts/integration_spark.sh         | 40 +++++++++++++++++++++++----------
 dev/tasks/tasks.yml                     |  3 ++-
 docker-compose.yml                      |  2 +-
 4 files changed, 33 insertions(+), 18 deletions(-)
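
For anyone who wants to exercise the new mode locally, here is a rough sketch of how the pyarrow-only run could be driven through docker-compose. The service name and the PYTHON, SPARK and TEST_PYARROW_ONLY variables come from the tasks.yml and docker-compose.yml changes below; the exact local workflow (plain docker-compose versus a wrapper script) may differ from this sketch.

    # Sketch only: test Spark branch-3.0 against the latest PyArrow without
    # rebuilding Spark against the latest Arrow Java.
    export PYTHON=3.7
    export SPARK=branch-3.0
    export TEST_PYARROW_ONLY=true    # new variable introduced by this change
    docker-compose build conda-python-spark
    docker-compose run --rm conda-python-spark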

diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile
index d3f0a22..a2af2ac 100644
--- a/ci/docker/conda-python-spark.dockerfile
+++ b/ci/docker/conda-python-spark.dockerfile
@@ -23,12 +23,10 @@ FROM ${repo}:${arch}-conda-python-${python}
 ARG jdk=8
 ARG maven=3.5
 
-# The Spark tests currently break with pandas >= 1.0
 RUN conda install -q \
-        patch \
-        pandas=0.25.3 \
         openjdk=${jdk} \
-        maven=${maven} && \
+        maven=${maven} \
+        pandas && \
     conda clean --all
 
 # installing specific version of spark
diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh
index a45ed7a..a53a629 100755
--- a/ci/scripts/integration_spark.sh
+++ b/ci/scripts/integration_spark.sh
@@ -20,6 +20,11 @@ set -eu
 
 source_dir=${1}
 spark_dir=${2}
+
+# Test Spark with latest PyArrow only, don't build with latest Arrow Java
+test_pyarrow_only=${3:-false}
+
+# Spark branch to checkout
 spark_version=${SPARK_VERSION:-master}
 
 # Use old behavior that always dropped timezones.
@@ -30,6 +35,7 @@ if [ "${SPARK_VERSION:0:2}" == "2." ]; then
   export ARROW_PRE_0_15_IPC_FORMAT=1
 fi
 
+# Get Arrow Java version
 pushd ${source_dir}/java
   arrow_version=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'`
 popd
@@ -37,23 +43,33 @@ popd
 export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=warn"
 export MAVEN_OPTS="${MAVEN_OPTS} -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
 
-# build Spark with Arrow
 pushd ${spark_dir}
-  # update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark
-  echo "Building Spark with Arrow ${arrow_version}"
-  mvn versions:set-property -Dproperty=arrow.version -DnewVersion=${arrow_version}
 
-  build/mvn -B -DskipTests package -pl sql/core -pl assembly -am
+  if [ "${test_pyarrow_only}" == "true" ]; then
+    echo "Building Spark ${SPARK_VERSION} to test pyarrow only"
+
+    # Build Spark only
+    build/mvn -B -DskipTests package
+
+  else
+
+    # Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark
+    echo "Building Spark ${SPARK_VERSION} with Arrow ${arrow_version}"
+    mvn versions:set-property -Dproperty=arrow.version -DnewVersion=${arrow_version}
+
+    # Build Spark with new Arrow Java
+    build/mvn -B -DskipTests package -pl sql/core -pl assembly -am
 
-  spark_scala_tests=(
-    "org.apache.spark.sql.execution.arrow"
-    "org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite"
-    "org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite")
+    spark_scala_tests=(
+      "org.apache.spark.sql.execution.arrow"
+      "org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite"
+      "org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite")
 
-  (echo "Testing Spark:"; IFS=$'\n'; echo "${spark_scala_tests[*]}")
+    (echo "Testing Spark:"; IFS=$'\n'; echo "${spark_scala_tests[*]}")
 
-  # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working
-  build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${spark_scala_tests[*]}") test
+    # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working
+    build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${spark_scala_tests[*]}") test
+  fi
 
   # Run pyarrow related Python tests only
   spark_python_tests=(
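
For reference, a minimal sketch of invoking the updated script by hand, using the new third positional argument added above. The /arrow and /spark paths are the ones the docker-compose service passes (see the docker-compose.yml change below); SPARK_VERSION selects the Spark branch, as in the script.

    # Sketch: pyarrow-only run, Spark is built as-is with its pinned Arrow Java
    SPARK_VERSION=branch-3.0 \
      /arrow/ci/scripts/integration_spark.sh /arrow /spark true

    # Sketch: default behaviour, Spark is rebuilt with the local Arrow Java
    SPARK_VERSION=master \
      /arrow/ci/scripts/integration_spark.sh /arrow /spark false
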
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 1faa13b..a0ebe33 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1943,7 +1943,8 @@ tasks:
       env:
         PYTHON: 3.7
         SPARK: "branch-3.0"
-      # use the master branch of spark, so prevent reusing any layers
+        TEST_PYARROW_ONLY: "true"
+      # use branch-3.0 of spark, so prevent reusing any layers
       run: --no-leaf-cache conda-python-spark
 
   test-conda-python-3.8-spark-master:
diff --git a/docker-compose.yml b/docker-compose.yml
index bfdc1aa..ca89897 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1339,4 +1339,4 @@ services:
       ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
         /arrow/ci/scripts/python_build.sh /arrow /build &&
         /arrow/ci/scripts/java_build.sh /arrow /build &&
-        /arrow/ci/scripts/integration_spark.sh /arrow /spark"]
+        /arrow/ci/scripts/integration_spark.sh /arrow /spark ${TEST_PYARROW_ONLY:-false}"]
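
The ${TEST_PYARROW_ONLY:-false} expansion above means tasks that never set the variable, such as the existing Spark master job, keep the previous behaviour. A small illustration of the shell default (not project code):

    unset TEST_PYARROW_ONLY
    echo "${TEST_PYARROW_ONLY:-false}"   # "false": Spark is rebuilt with the latest Arrow Java, Scala suites run
    TEST_PYARROW_ONLY=true
    echo "${TEST_PYARROW_ONLY:-false}"   # "true": Spark builds as-is, the Scala Arrow suites are skipped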