You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/03/23 22:03:48 UTC

[impala] 01/03: IMPALA-9107 (part 1): Add scripts to produce an m2 archive

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 603e5147d59a4f6a79e5cfeaf9d1421d01bf2d09
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Mon Oct 28 13:40:25 2019 -0700

    IMPALA-9107 (part 1): Add scripts to produce an m2 archive
    
    The maven build downloads a large number of artifacts from
    various maven repositories. When starting with an empty .m2
    directory (like most upstream Jenkins jobs), downloading
    all the artifacts can take up to 30 minutes. This has been
    slowing down our precommit builds by 15-20 minutes.
    
    This adds a script to archive the .m2 directory into a
    tarball while excluding artifacts from impala.cdp.repo
    and impala.cdh.repo. This will later be used to prepopulate
    the .m2 directory for Jenkins jobs.
    
    This adds a script to parse the maven log and output how
    many maven artifacts are downloaded from each repository.
    It also prints how many downloads were attempted for each
    repository. This might aid in diagnosing slowness.
    
    This also changes mvn-quiet.sh to add logging that prints
    a timestamp. It also adds the -B flag to mvn, which causes
    maven to run in batch mode. This makes the output easier
    to parse, because maven omits special console formatting
    characters such as ^M (carriage return).
    
    This changes build-all-flag-combinations.sh to print the
    maven statistics after each part of the build and call the
    script to produce an m2 archive at the end.
    
    Change-Id: I043912f5fbc7cf24ee80b2855354656aa587ca9f
    Reviewed-on: http://gerrit.cloudera.org:8080/14562
    Reviewed-by: Laszlo Gaal <la...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/jenkins/archive_m2_directory.sh        | 84 ++++++++++++++++++++++++++++++
 bin/jenkins/build-all-flag-combinations.sh | 23 ++++++++
 bin/jenkins/get_maven_statistics.sh        | 42 +++++++++++++++
 bin/mvn-quiet.sh                           |  4 +-
 impala-parent/pom.xml                      |  3 ++
 5 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/bin/jenkins/archive_m2_directory.sh b/bin/jenkins/archive_m2_directory.sh
new file mode 100755
index 0000000..328122c
--- /dev/null
+++ b/bin/jenkins/archive_m2_directory.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This script creates a tarball of the ~/.m2 directory that is
+# intended to be used to prepopulate the .m2 directory. Since
+# we want CDH/CDP dependencies to come from impala.cdh.repo or
+# impala.cdp.repo, it parses the maven log to detect artifacts
+# that come from those repositories and does not include them
+# in the tarball. The script does not make any changes to the
+# ~/.m2 directory.
+#
+# The script takes two arguments: the maven log and the filename
+# for the output tarball.
+# archive_m2_directory.sh [maven_log] [output_tarball]
+#
+# There are two requirements for the maven log:
+# 1. It needs to be produced by a recent maven version (such as 3.5.4 installed by
+# bin/bootstrap_system.sh). This is required because recent maven outputs lines like:
+# [INFO] Downloading from {repo}: {url}
+# [INFO] Downloaded from {repo}: {url}
+# Older maven (e.g. 3.3.9) omits the "from {repo}" part.
+# 2. Maven needs to run in batch mode (-B). This keeps the output from using special
+# characters to format things on the console (e.g. carriage return ^M).
+set -euo pipefail
+
+MVN_LOG=$1
+OUTFILE=$2
+
+TMP_DIR=$(mktemp -d)
+ARCHIVE_DIR=${TMP_DIR}/repository
+
+function onexit {
+  echo "$0: Cleaning up temporary directory"
+  rm -rf -- "${TMP_DIR}"
+}
+trap onexit EXIT
+
+# Make our own copy of .m2 in a temp directory
+mkdir -p "${ARCHIVE_DIR}"
+cp -R ~/.m2/repository/* "${ARCHIVE_DIR}"
+
+# We want to remove artifacts/directories that belong to impala.cdh.repo or
+# impala.cdp.repo. This scans the maven log for the URLs downloaded from them,
+# relying on the form of those URLs (i.e. that the actual directory structure
+# starts after '/maven/'), and extracts the directory/filename downloaded. sed does
+# the filtering because a grep without matches would fail the script under pipefail.
+sed -n '/Downloaded from/{s|.* Downloaded from ||;/impala\.cd[ph]\.repo/p;}' \
+    "${MVN_LOG}" | cut -d' ' -f2 \
+    | sed 's|.*/maven/||' > "${TMP_DIR}/cdp_cdh_artifacts.txt"
+
+# Simplify it to a list of directories that contain artifacts from impala.cdp.repo
+# and impala.cdh.repo. SNAPSHOT artifacts like maven-metadata.xml can result in
+# multiple artifacts in the directory, so it is useful to get rid of the whole
+# directory. Entries without a '/' have no parent directory and are skipped.
+sed -n 's|/[^/]*$||p' "${TMP_DIR}/cdp_cdh_artifacts.txt" | sort \
+    | uniq > "${TMP_DIR}/cdp_cdh_directories.txt"
+
+# Remove the directories from our copy of m2
+while IFS= read -r dir; do
+    [[ -n "${dir}" ]] || continue  # never target ${ARCHIVE_DIR} itself
+    echo "Removing directory ${ARCHIVE_DIR}/${dir}"
+    rm -rf -- "${ARCHIVE_DIR:?}/${dir}"
+done < "${TMP_DIR}/cdp_cdh_directories.txt"
+
+# Tar it up
+tar -zcf "${OUTFILE}" -C "${TMP_DIR}" repository
+
+# Note: The exit callback handles cleanup of the temp directory.
diff --git a/bin/jenkins/build-all-flag-combinations.sh b/bin/jenkins/build-all-flag-combinations.sh
index cbbc7c4..6c2edd5 100755
--- a/bin/jenkins/build-all-flag-combinations.sh
+++ b/bin/jenkins/build-all-flag-combinations.sh
@@ -48,6 +48,15 @@ CONFIGS=(
 
 FAILED=""
 
+TMP_DIR=$(mktemp -d)  # scratch space for the per-round and accumulated maven logs
+function onexit {
+  echo "$0: Cleaning up temporary directory"
+  rm -rf -- "${TMP_DIR}"  # quoted so an unusual TMPDIR cannot split the path
+}
+trap onexit EXIT
+# mktemp -d already created the directory; mkdir -p is kept as a harmless no-op.
+mkdir -p "${TMP_DIR}"
+
 for CONFIG in "${CONFIGS[@]}"; do
   CONFIG2=${CONFIG/-use_cdp_hive/}
   if [[ "$CONFIG" != "$CONFIG2" ]]; then
@@ -73,11 +82,25 @@ for CONFIG in "${CONFIGS[@]}"; do
     FAILED="${FAILED}:${DESCRIPTION}"
   fi
   ccache -s
+  bin/jenkins/get_maven_statistics.sh logs/mvn/mvn.log
+
+  # Keep each maven log from each round of the build
+  cp logs/mvn/mvn.log "${TMP_DIR}/mvn.$(date +%s.%N).log"
+  # Append the maven log to the accumulated maven log
+  cat logs/mvn/mvn.log >> "${TMP_DIR}/mvn_accumulated.log"
 done
 
+# Save the maven logs before TMP_DIR is deleted (no clash with the existing mvn.log)
+cp "${TMP_DIR}"/mvn* logs/mvn
+
 if [[ "$FAILED" != "" ]]
 then
   echo "The following builds failed:"
   echo "$FAILED"
   exit 1
 fi
+
+# Make a tarball of the .m2 directory
+bin/jenkins/archive_m2_directory.sh logs/mvn/mvn_accumulated.log logs/m2_archive.tar.gz
+
+# Note: The exit callback handles cleanup of the temp directory.
diff --git a/bin/jenkins/get_maven_statistics.sh b/bin/jenkins/get_maven_statistics.sh
new file mode 100755
index 0000000..aff473f
--- /dev/null
+++ b/bin/jenkins/get_maven_statistics.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Expect maven log as the first argument.
+# There are two requirements for the maven log:
+# 1. It needs to be produced by a recent maven version (such as 3.5.4 installed by
+# bin/bootstrap_system.sh). This is required because recent maven outputs lines like:
+# [INFO] Downloading from {repo}: {url}
+# [INFO] Downloaded from {repo}: {url}
+# Older maven (e.g. 3.3.9) omits the "from {repo}" part.
+# 2. Maven needs to run in batch mode (-B). This keeps the output from using special
+# characters to format things on the console (e.g. carriage return ^M).
+set -euo pipefail
+
+MVN_LOG=$1
+
+# Tally log lines per repo. sed -n does the filtering instead of grep because grep
+# exits non-zero on zero matches, which aborts the script under 'set -eo pipefail'.
+echo "Number of artifacts downloaded from each repo:"
+sed -n '/Downloaded from/{s|.* Downloaded from ||;p;}' "${MVN_LOG}" \
+    | cut -d: -f1 | sort | uniq -c
+
+echo
+echo "Number of download attempts (successful or unsuccessful) per repo:"
+sed -n '/Downloading from/{s|.* Downloading from ||;p;}' "${MVN_LOG}" \
+    | cut -d: -f1 | sort | uniq -c
diff --git a/bin/mvn-quiet.sh b/bin/mvn-quiet.sh
index beeaeb9..d8e3711 100755
--- a/bin/mvn-quiet.sh
+++ b/bin/mvn-quiet.sh
@@ -31,7 +31,9 @@ Directory $(pwd)
 ========================================================================
 EOF
 
-if ! mvn $IMPALA_MAVEN_OPTIONS "$@" | \
+# Always use maven's batch mode (-B), as it produces output that is easier to parse.
+# Also, add a timestamp to the maven output.
+if ! mvn -B $IMPALA_MAVEN_OPTIONS "$@" | awk '{ print strftime("[%H:%M:%S]"), $0 }' | \
   tee -a "$LOG_FILE" | grep -E -e WARNING -e ERROR -e SUCCESS -e FAILURE -e Test; then
   echo "mvn $IMPALA_MAVEN_OPTIONS $@ exited with code $?"
   exit 1
diff --git a/impala-parent/pom.xml b/impala-parent/pom.xml
index 7c54e4b..adb8158 100644
--- a/impala-parent/pom.xml
+++ b/impala-parent/pom.xml
@@ -120,6 +120,9 @@ under the License.
       <snapshots>
         <enabled>false</enabled>
       </snapshots>
+      <releases>
+        <enabled>false</enabled>
+      </releases>
     </repository>
     <repository>
       <!--