You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by kw...@apache.org on 2016/10/25 05:53:22 UTC

[2/6] incubator-impala git commit: Add distcc infrastructure.

Add distcc infrastructure.

This has been working for several months, and it was written mainly
by Casey Ching while he was at Cloudera working on Impala.

Change-Id: Ia4bc78ad46dda13e4533183195af632f46377cae
Reviewed-on: http://gerrit.cloudera.org:8080/4820
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0eaff805
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0eaff805
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0eaff805

Branch: refs/heads/master
Commit: 0eaff805e28dd4afac134f58b294732e414235ce
Parents: e0a3272
Author: Jim Apple <jb...@cloudera.com>
Authored: Sun Oct 23 14:54:08 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Tue Oct 25 01:15:50 2016 +0000

----------------------------------------------------------------------
 .gitignore               |   2 +-
 bin/distcc/.gitignore    |   1 +
 bin/distcc/README.md     | 106 ++++++++++++++++++++++++++++
 bin/distcc/distcc.sh     |  62 ++++++++++++++++
 bin/distcc/distcc_env.sh | 160 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0eaff805/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 849ee61..e63f863 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,7 @@ org.eclipse.jdt.ui.prefs
 load-*-generated.sql
 bin/version.info
 
-# Cloudera distcc options
+# distcc options
 .impala_compiler_opts
 
 pprof.out

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0eaff805/bin/distcc/.gitignore
----------------------------------------------------------------------
diff --git a/bin/distcc/.gitignore b/bin/distcc/.gitignore
new file mode 100644
index 0000000..ce71f70
--- /dev/null
+++ b/bin/distcc/.gitignore
@@ -0,0 +1 @@
+ld

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0eaff805/bin/distcc/README.md
----------------------------------------------------------------------
diff --git a/bin/distcc/README.md b/bin/distcc/README.md
new file mode 100644
index 0000000..2de7d4a
--- /dev/null
+++ b/bin/distcc/README.md
@@ -0,0 +1,106 @@
+# Distcc
+Distcc will speed up compilation by distributing compilation tasks to remote build
+machines. The scripts in this folder make using distcc easier.
+
+# Requirements
+
+The only requirement you need to be aware of is that the scripts in this folder were
+only tested on Linux. If you are using OS X, things probably won't work out of the box.
+
+Assuming you are using Linux, if you use the scripts in this folder, there shouldn't be
+any other requirements. The distcc program should be installed and configured
+automatically. Still, understanding what is involved could be useful.
+
+**You shouldn't need to do any of this; the scripts do it for you.**
+
+1. Install distcc and ccache. Most Linux distros have these packages. The scripts will
+   install them if you have a yum or apt-get based system. Otherwise you should install
+   distcc and ccache yourself through whatever package manager your system uses.
+1. Configure the remote distcc hosts. Set your environment variable BUILD_FARM to
+   "host1/limit1,lzo host2/limit2,lzo" and so on.
+1. Your local compiler needs to be at the same path as it is on the remote build slaves.
+   That path is /opt/Impala-Toolchain/<gcc-version-folder>/bin/gcc. In other words, make
+   sure the Impala toolchain is available at /opt/Impala-Toolchain. That can be done
+   through a symlink, and that's what the scripts will attempt to set up.
+
+# Usage
+
+### First time
+1. Source bin/impala-config.sh in the Impala repo. Step #2 depends on this.
+
+        source "$IMPALA_HOME"/bin/impala-config.sh
+
+1. Source "distcc_env.sh" in this directory. The script will attempt to install distcc
+   if needed.
+
+        source "$IMPALA_HOME"/bin/distcc/distcc_env.sh
+
+1. Run buildall.sh. The main purpose is to regenerate cmakefiles.
+
+        cd "$IMPALA_HOME"
+        ./buildall.sh -skiptests -so   # Do not use -noclean
+
+   You should notice that the build runs quite a bit faster.
+
+### Incremental builds
+At this point you no longer need to run the heavyweight buildall.sh. After editing files
+you can either
+```
+make -j$(distcc -j)
+```
+or
+```
+bin/make_impala.sh
+```
+
+### Switching back to local compilation
+If you want to compile a very small change, a local build might be faster.
+```
+switch_compiler local
+```
+to switch back
+```
+switch_compiler distcc
+```
+
+### Switch to clang++
+Clang is faster and gives better error messages. This setup is still somewhat
+experimental.
+```
+switch_compiler clang
+```
+to switch back
+```
+switch_compiler gcc
+```
+
+### Second time
+If you open a new terminal and attempt to build with "make" or "bin/make_impala.sh",
+that will fail. To fix:
+```
+source "$IMPALA_HOME"/bin/impala-config.sh   # Skip if already done
+source "$IMPALA_HOME"/bin/distcc/distcc_env.sh
+```
+
+# Setting up a new distcc server
+
+1. Install "distccd" and "ccache".
+1. Configure distccd (edit /etc/sysconfig/distccd on a RHEL server) with the options
+   OPTIONS="--jobs 96 --allow YOUR.IP.ADDRESS.HERE --log-level=warn --nice=-15"
+   Where num jobs = 2x the number of cores on the machine. (2x is recommended by distcc.)
+1. Start distcc.
+1. Add the new host to the BUILD_FARM environment variable read by distcc_env.sh.
+1. Install all gcc and binutils versions from the toolchain into /opt/Impala-Toolchain.
+1. ccache stores its cache in $HOME/.ccache. Assuming distcc is running as a non-root user
+   that has no $HOME, you must sudo mkdir /.ccache, then sudo chmod 777 /.ccache.
+1. If distcc runs as "nobody", sudo -u nobody ccache -M 25G. This sets the size of the
+   cache to 25GB. Adjust to your taste.
+
+# Misc notes
+
+1. "pump" doesn't work. Many compilation attempts time out and say something like "Include
+   server did not process the request in 3.8 seconds". distcc tries to copy 3rd party
+   headers to the remote hosts and that may be the problem. If we could get the include
+   server to use the remote 3rd party headers that should help.
+1. Having a different local Linux OS on your development machine than on the distcc hosts
+   should be fine.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0eaff805/bin/distcc/distcc.sh
----------------------------------------------------------------------
diff --git a/bin/distcc/distcc.sh b/bin/distcc/distcc.sh
new file mode 100755
index 0000000..a1136e8
--- /dev/null
+++ b/bin/distcc/distcc.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if [[ -z "$DISTCC_HOSTS" ]] || [[ -z "$IMPALA_REAL_CXX_COMPILER" ]]; then
+  # distcc_env.sh could be sourced right here and the build would work, but the
+  # parallelization (-j) would already be wrong by now and it is too late to fix it.
+  SCRIPT_DIR=$(dirname "$0")
+  echo "You must source '$SCRIPT_DIR/distcc_env.sh' before attempting to build." 1>&2
+  exit 1
+fi
+
+TOOLCHAIN_DIR=/opt/Impala-Toolchain  # Must be the same path as on the remote build hosts.
+if [[ ! -d "$TOOLCHAIN_DIR" ]]; then
+  if [[ -z "$IMPALA_TOOLCHAIN" || ! -d "$IMPALA_TOOLCHAIN" ]]; then
+    echo "The toolchain wasn't found at '$TOOLCHAIN_DIR' and IMPALA_TOOLCHAIN is not set." \
+        "Make sure the toolchain is available at $TOOLCHAIN_DIR and try again." 1>&2
+    exit 1
+  # Try a passwordless symlink; on success fall through and carry on with the build.
+  elif ! sudo -n -- ln -s "$IMPALA_TOOLCHAIN" "$TOOLCHAIN_DIR" &>/dev/null; then
+    echo "The toolchain must be available at $TOOLCHAIN_DIR for distcc." \
+        "Try running 'sudo ln -s $IMPALA_TOOLCHAIN $TOOLCHAIN_DIR'." 1>&2
+    exit 1
+  fi
+fi
+
+# Build the compiler invocation as arrays so every argument survives as a single word.
+CMD=()
+CMD_POST_ARGS=()
+# "false" selects local compilation; unset/empty keeps defaulting to distcc + ccache.
+if [[ "$IMPALA_USE_DISTCC" != false ]]; then CMD=(distcc ccache); fi
+
+GCC_ROOT="$TOOLCHAIN_DIR/gcc-$IMPALA_GCC_VERSION"
+case "$IMPALA_REAL_CXX_COMPILER" in
+  gcc) CMD+=("$GCC_ROOT/bin/g++");;
+  clang) # Assume the compilation options were setup for gcc, which would happen using
+         # default build options. Now some additional options need to be added for clang.
+         CMD+=("$TOOLCHAIN_DIR/llvm-$IMPALA_LLVM_ASAN_VERSION/bin/clang++")
+         CMD+=("--gcc-toolchain=$GCC_ROOT")
+         # -Wno-unused-local-typedef needs to go after -Wall
+         # -Wno-error is needed, clang generates more warnings than gcc.
+         CMD_POST_ARGS+=(-Wno-unused-local-typedef -Wno-error);;
+  *) echo "Unexpected IMPALA_REAL_CXX_COMPILER: '$IMPALA_REAL_CXX_COMPILER'" 1>&2
+     exit 1;;
+esac
+
+exec "${CMD[@]}" "$@" "${CMD_POST_ARGS[@]}"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0eaff805/bin/distcc/distcc_env.sh
----------------------------------------------------------------------
diff --git a/bin/distcc/distcc_env.sh b/bin/distcc/distcc_env.sh
new file mode 100644
index 0000000..173cc18
--- /dev/null
+++ b/bin/distcc/distcc_env.sh
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is intended to be sourced by a shell (zsh and bash have been tested).
+
+# The remote build hosts come from BUILD_FARM; refuse to continue without it.
+if [[ -z "$BUILD_FARM" ]]; then
+  echo "BUILD_FARM must be set to configure distcc" >&2
+  return 1
+fi
+
+if [[ -n $ZSH_NAME ]]; then  # zsh lacks BASH_SOURCE; use its own idiom for this file's path.
+  DISTCC_ENV_DIR=$(cd $(dirname ${(%):-%x}) && pwd)
+else
+  DISTCC_ENV_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+fi
+
+function cmd_exists {  # Returns 0 iff "$1" names something the shell can run.
+  command -v "$1" &>/dev/null
+}
+
+INSTALLER=  # Package manager for automatic installs; stays empty if none is found.
+if cmd_exists apt-get; then
+  INSTALLER=apt-get
+elif cmd_exists yum; then
+  INSTALLER=yum
+fi
+
+if ! cmd_exists distcc; then  # Install distcc if necessary.
+  echo distcc command not found, attempting installation
+  if [[ -z "$INSTALLER" ]] || ! sudo "$INSTALLER" -y install distcc; then
+    echo Unable to automatically install distcc. You need to install it manually. 1>&2
+    return 1
+  fi
+fi
+
+# Install ccache if necessary. As with distcc, a failure is reported on stderr.
+if ! cmd_exists ccache; then
+  echo "ccache command not found, attempting installation"
+  if [[ -z "$INSTALLER" ]] || ! sudo "$INSTALLER" -y install ccache; then
+    echo "Unable to automatically install ccache" 1>&2
+    return 1
+  fi
+fi
+
+# Don't include localhost in the list of compile hosts. It already has to do all of the
+# preprocessing and linking, which makes it the slowest part of the build, so there
+# shouldn't be a need to use it as an extra compilation worker too.
+# Both local slot limits are set to the number of cores reported by nproc.
+export DISTCC_HOSTS=" --localslots=$(nproc)"
+DISTCC_HOSTS+=" --localslots_cpp=$(nproc)"
+DISTCC_HOSTS+=" --randomize"
+DISTCC_HOSTS+=" ${BUILD_FARM}"
+
+# The compiler that distcc.sh should use: gcc or clang. A value that is already set is
+# kept as is; otherwise the variable is exported empty.
+export IMPALA_REAL_CXX_COMPILER="${IMPALA_REAL_CXX_COMPILER-}"
+
+# Set to false to use local compilation instead of distcc. As above, an existing value
+# is kept and the default is empty.
+export IMPALA_USE_DISTCC="${IMPALA_USE_DISTCC-}"
+
+# Some compiler-option state exists only in environment variables, even after the make
+# files have been generated. Such variables are saved to this file for later restoring.
+[[ -n "$IMPALA_HOME" ]] || {
+  echo '$IMPALA_HOME must be set before sourcing this file.' 1>&2
+  return 1
+}
+IMPALA_COMPILER_CONFIG_FILE="$IMPALA_HOME/.impala_compiler_opts"
+
+# Completely turns off anything this script could have set up: selects the default
+# compiler, resets the build threads to the local core count, saves the settings and
+# cleans the make files. Returns 1 if the cleaning fails.
+function disable_distcc {
+  export IMPALA_CXX_COMPILER=default IMPALA_BUILD_THREADS="$(nproc)"
+  save_compiler_opts
+  clean_cmake_files || {
+    echo "Failed to clean cmake files." 1>&2
+    return 1
+  }
+  echo "distcc is not fully disabled, run 'buildall.sh' to complete the change." \
+    "Run 'enable_distcc' to enable."
+}
+
+function enable_distcc {  # Points the build at distcc.sh using distcc + gcc.
+  export IMPALA_CXX_COMPILER="$DISTCC_ENV_DIR/distcc.sh"
+  switch_compiler distcc gcc
+  export IMPALA_BUILD_THREADS="$(distcc -j)"
+  clean_cmake_files || {
+    echo "Failed to clean cmake files." 1>&2
+    return 1
+  }
+  echo "distcc is not fully enabled, run 'buildall.sh' to complete the change." \
+    "Run 'disable_distcc' or 'switch_compiler local' to disable."
+}
+
+# Cleans old CMake files; this is required when switching between distcc.sh and direct
+# compilation. Returns non-zero if IMPALA_HOME is not a directory or xargs reports failure.
+function clean_cmake_files {
+  if [[ -z "$IMPALA_HOME" || ! -d "$IMPALA_HOME" ]]; then
+    echo "IMPALA_HOME=$IMPALA_HOME is not valid." 1>&2
+    return 1
+  fi
+  # Copied from $IMPALA_HOME/bin/clean.sh, but NUL-delimited so odd file names are safe.
+  find "$IMPALA_HOME" -iname '*cmake*' -not -name CMakeLists.txt \
+      -not -path '*cmake_modules*' \
+      -not -path '*thirdparty*' -print0 | xargs -0 rm -rf --
+}
+
+function switch_compiler {  # Usage: switch_compiler (local|distcc|gcc|clang)...
+  local ARG  # Keep the loop variable out of the shell that sourced this file.
+  for ARG in "$@"; do
+    case "$ARG" in
+      "local")
+        IMPALA_USE_DISTCC=false
+        IMPALA_BUILD_THREADS=$(nproc);;
+      distcc)
+        IMPALA_USE_DISTCC=true
+        IMPALA_BUILD_THREADS=$(distcc -j);;
+      gcc|clang) IMPALA_REAL_CXX_COMPILER=$ARG;;
+      *) echo "Valid compiler options are:
+    'local'  - Don't use distcc and set -j value to $(nproc). (gcc/clang) remains unchanged.
+    'distcc' - Use distcc and set -j value to $(distcc -j). (gcc/clang) remains unchanged.
+    'gcc'    - Use gcc. (local/distcc remains unchanged).
+    'clang'  - Use clang. (local/distcc remains unchanged)." 1>&2  # Usage goes to stderr.
+        return 1;;
+    esac
+  done
+  save_compiler_opts
+}
+
+function save_compiler_opts {  # Writes the current settings to the config file.
+  rm -f "$IMPALA_COMPILER_CONFIG_FILE"
+  # %q shell-quotes each value so that sourcing the file restores it verbatim.
+  printf '%s=%q\n' \
+      IMPALA_CXX_COMPILER "$IMPALA_CXX_COMPILER" \
+      IMPALA_BUILD_THREADS "$IMPALA_BUILD_THREADS" \
+      IMPALA_USE_DISTCC "$IMPALA_USE_DISTCC" \
+      IMPALA_REAL_CXX_COMPILER "$IMPALA_REAL_CXX_COMPILER" > "$IMPALA_COMPILER_CONFIG_FILE"
+}
+
+if [[ ! -e "$IMPALA_COMPILER_CONFIG_FILE" ]]; then
+  enable_distcc  # No saved settings yet: set up distcc + gcc and save that choice.
+else
+  source "$IMPALA_COMPILER_CONFIG_FILE"  # Restore the previously saved settings.
+fi