You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/03/31 03:28:01 UTC

[impala] 02/04: IMPALA-10606: Simplify impala-python virtualenv bootstrapping

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1142c7b58eeb26cf727327c0e42329184cfd52bf
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Wed Mar 24 14:33:37 2021 -0700

    IMPALA-10606: Simplify impala-python virtualenv bootstrapping
    
    Bootstrapping the impala-python virtualenv requires multiple
    rounds of pip installs with different sets of requirements.
    This consolidates the requirements.txt, stage2-requirements.txt,
    and compiled-requirements.txt into a single requirements.txt.
    This will make it easier to upgrade python packages.
    
    This also splits out setuptools into its own
    setuptools-requirements.txt. Setuptools is used during the
    pip install for several of the dependencies. Recent versions
    of setuptools do not support Python 2, but some of the install
    tools (like easy_install) don't know how to pick a version
    of setuptools that works with Python 2. Splitting it out to its
    own requirements file lets us pin the version.
    
    To make review easier, this does not change any of the versions
    of the dependencies. It also leaves the stage2-requirements.txt
    and compiled-requirements.txt split out in separate sections
    of requirements.txt. These will later be turned into a single
    alphabetical list.
    
    Testing:
     - Tested impala-python locally
     - Ran GVO
    
    Change-Id: I8e920e5a257f1e0613065685078624a50d59bf2e
    Reviewed-on: http://gerrit.cloudera.org:8080/17226
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Joe McDonnell <jo...@cloudera.com>
---
 infra/python/bootstrap_virtualenv.py               | 107 +++++++++------------
 infra/python/deps/pip_download.py                  |   5 +-
 infra/python/deps/requirements.txt                 |  40 +++++++-
 ...equirements.txt => setuptools-requirements.txt} |  17 +---
 infra/python/deps/stage2-requirements.txt          |  40 --------
 5 files changed, 88 insertions(+), 121 deletions(-)

diff --git a/infra/python/bootstrap_virtualenv.py b/infra/python/bootstrap_virtualenv.py
index e85a1e4..604177d 100644
--- a/infra/python/bootstrap_virtualenv.py
+++ b/infra/python/bootstrap_virtualenv.py
@@ -18,21 +18,19 @@
 # This module will create a python virtual env and install external dependencies. If the
 # virtualenv already exists and it contains all the expected packages, nothing is done.
 #
-# A multi-step bootstrapping process is required to build and install all of the
-# dependencies:
-# 1. install basic non-C/C++ packages into the virtualenv
-# 1b. install packages that depend on step 1 but cannot be installed together with their
-#     dependencies
-# 2. use the virtualenv Python to bootstrap the toolchain
-# 3. use toolchain gcc to build C/C++ packages
-# 4. build the kudu-python package with toolchain gcc and Cython
+# It is expected that bootstrap_toolchain.py already ran prior to running this
+# (and thus the toolchain GCC compiler is in place).
 #
-# Every time this script is run, it completes as many of the bootstrapping steps as
-# possible with the available dependencies.
+# The virtualenv creation process involves multiple rounds of pip installs, but
+# this script expects to complete all rounds in a single invocation. The steps are:
+# 1. Install setuptools and its dependencies. These are used by the setup.py scripts
+#    that run during pip install.
+# 2. Install most packages (including ones that require C/C++ compilation)
+# 3. Install Kudu package (which uses the toolchain GCC and the installed Cython)
+# 4. Install ADLS packages if applicable
 #
-# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
-# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
-# instead.
+# This module can be run with python >= 2.7. It makes no guarantees about usage on
+# python < 2.7.
 
 from __future__ import print_function
 import glob
@@ -44,7 +42,6 @@ import subprocess
 import sys
 import tarfile
 import tempfile
-import textwrap
 import urllib
 from bootstrap_toolchain import ToolchainPackage
 
@@ -57,25 +54,27 @@ GCC_VERSION = os.environ["IMPALA_GCC_VERSION"]
 DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
 ENV_DIR = os.path.join(os.path.dirname(__file__), "env-gcc{0}".format(GCC_VERSION))
 
-# Requirements file with packages we need for our build and tests.
-REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")
-
-# Second stage of requirements which cannot be installed together with their dependencies
-# in requirements.txt.
-REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")
+# Setuptools requirements file. Setuptools is required during pip install for
+# some packages. Newer setuptools dropped python 2 support, and some python
+# install tools don't understand that they need to get a version that works
+# with the current python version. This can cause them to try to install the newer
+# setuptools that won't work on python 2. Doing this as a separate step makes it
+# easy to pin the version of setuptools to a Python 2 compatible version.
+SETUPTOOLS_REQS_PATH = os.path.join(DEPS_DIR, "setuptools-requirements.txt")
 
-# Requirements for the next bootstrapping step that builds compiled requirements
-# with toolchain gcc.
-COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")
+# Requirements file with packages we need for our build and tests, which depends
+# on setuptools being installed by the setuptools requirements step.
+REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")
 
 # Requirements for the Kudu bootstrapping step, which depends on Cython being installed
-# by the compiled requirements step.
+# by the requirements step.
 KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")
 
 # Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
-# Interface) being installed by the compiled requirements step.
+# Interface) being installed by the requirements step.
 ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
 
+
 def delete_virtualenv_if_exist():
   if os.path.exists(ENV_DIR):
     shutil.rmtree(ENV_DIR)
@@ -108,6 +107,7 @@ def exec_cmd(args, **kwargs):
         % (args, output))
   return output
 
+
 def use_ccache():
   '''Returns true if ccache is available and should be used'''
   if 'DISABLE_CCACHE' in os.environ: return False
@@ -117,6 +117,7 @@ def use_ccache():
   except:
     return False
 
+
 def select_cc():
   '''Return the C compiler command that should be used as a string or None if the
   compiler is not available '''
@@ -129,6 +130,7 @@ def select_cc():
   if use_ccache(): cc = "ccache %s" % cc
   return cc
 
+
 def exec_pip_install(args, cc="no-cc-available", env=None):
   '''Executes "pip install" with the provided command line arguments. If 'cc' is set,
   it is used as the C compiler. Otherwise compilation of C/C++ code is disabled by
@@ -218,48 +220,28 @@ def download_toolchain_python():
 
 
 def install_deps():
+  LOG.info("Installing setuptools into the virtualenv")
+  exec_pip_install(["-r", SETUPTOOLS_REQS_PATH])
+  cc = select_cc()
+  if cc is None:
+    raise Exception("CC not available")
+  env = dict(os.environ)
   LOG.info("Installing packages into the virtualenv")
-  exec_pip_install(["-r", REQS_PATH])
+  exec_pip_install(["-r", REQS_PATH], cc=cc, env=env)
   mark_reqs_installed(REQS_PATH)
-  LOG.info("Installing stage 2 packages into the virtualenv")
-  exec_pip_install(["-r", REQS2_PATH])
-  mark_reqs_installed(REQS2_PATH)
+
 
 def have_toolchain():
   '''Return true if the Impala toolchain is available'''
   return "IMPALA_TOOLCHAIN_PACKAGES_HOME" in os.environ
 
+
 def toolchain_pkg_dir(pkg_name):
   '''Return the path to the toolchain package'''
   pkg_version = os.environ["IMPALA_" + pkg_name.upper() + "_VERSION"]
   return os.path.join(os.environ["IMPALA_TOOLCHAIN_PACKAGES_HOME"],
       pkg_name + "-" + pkg_version)
 
-def install_compiled_deps_if_possible():
-  '''Install dependencies that require compilation with toolchain GCC, if the toolchain
-  is available. Returns true if the deps are installed'''
-  if reqs_are_installed(COMPILED_REQS_PATH):
-    LOG.debug("Skipping compiled deps: matching compiled-installed-requirements.txt found")
-    return True
-  cc = select_cc()
-  if cc is None:
-    LOG.debug("Skipping compiled deps: cc not available yet")
-    return False
-
-  env = dict(os.environ)
-
-  # Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
-  # problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
-  # workaround.
-  distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
-  print(distro_version)
-  if distro_version.startswith("centos5."):
-    env["CFLAGS"] = "-fgnu89-inline"
-
-  LOG.info("Installing compiled requirements into the virtualenv")
-  exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
-  mark_reqs_installed(COMPILED_REQS_PATH)
-  return True
 
 def install_adls_deps():
   # The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
@@ -275,9 +257,10 @@ def install_adls_deps():
     exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
     mark_reqs_installed(ADLS_REQS_PATH)
 
+
 def install_kudu_client_if_possible():
   '''Installs the Kudu python module if possible, which depends on the toolchain and
-  the compiled requirements in compiled-requirements.txt. If the toolchain isn't
+  the compiled requirements in requirements.txt. If the toolchain isn't
   available, nothing will be done.'''
   if reqs_are_installed(KUDU_REQS_PATH):
     LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
@@ -344,12 +327,14 @@ def error_if_kudu_client_not_found(install_dir):
         return
   raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
 
+
 def mark_reqs_installed(reqs_path):
-  '''Mark that the requirements from the given file are installed by copying it into the root
-  directory of the virtualenv.'''
+  '''Mark that the requirements from the given file are installed by copying it into
+  the root directory of the virtualenv.'''
   installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
   shutil.copyfile(reqs_path, installed_reqs_path)
 
+
 def reqs_are_installed(reqs_path):
   '''Check if the requirements from the given file are installed in the virtualenv by
   looking for a matching requirements file in the root directory of the virtualenv.'''
@@ -370,8 +355,9 @@ def reqs_are_installed(reqs_path):
   finally:
     installed_reqs_file.close()
 
+
 def setup_virtualenv_if_not_exists():
-  if not (reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH)):
+  if not (reqs_are_installed(REQS_PATH)):
     delete_virtualenv_if_exist()
     create_virtualenv()
     install_deps()
@@ -405,6 +391,5 @@ if __name__ == "__main__":
 
   # Complete as many bootstrap steps as possible (see file comment for the steps).
   setup_virtualenv_if_not_exists()
-  if install_compiled_deps_if_possible():
-    install_kudu_client_if_possible()
-    install_adls_deps()
+  install_kudu_client_if_possible()
+  install_adls_deps()
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index e322517..b289ce1 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -37,9 +37,8 @@ NUM_DOWNLOAD_ATTEMPTS = 8
 PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
 
 # The requirement files that list all of the required packages and versions.
-REQUIREMENTS_FILES = ['requirements.txt', 'stage2-requirements.txt',
-                      'compiled-requirements.txt', 'kudu-requirements.txt',
-                      'adls-requirements.txt']
+REQUIREMENTS_FILES = ['requirements.txt', 'setuptools-requirements.txt',
+                      'kudu-requirements.txt', 'adls-requirements.txt']
 
 
 def check_digest(filename, algorithm, expected_digest):
diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt
index ab1d758..f26f394 100644
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -48,9 +48,6 @@ python-magic == 0.4.11
 # attempting to install pywebhdfs (https://github.com/pywebhdfs/pywebhdfs/issues/52).
 # pywebhdfs itself will be installed in stage 2.
   pbr == 3.1.1
-# Newer versions of setuptools don't support Python 2.6
-setuptools == 36.8.0
-setuptools-scm == 1.15.4
 sh == 1.11
 six == 1.14.0
 sqlparse == 0.3.1
@@ -61,3 +58,40 @@ ipython == 1.2.1
   apipkg == 1.4
 
 virtualenv == 13.1.0
+
+#### Formerly stage2-requirements.txt
+
+# Requires setuptools-scm
+pytest == 2.9.2
+  py == 1.4.32
+  pytest-forked == 0.2
+  pytest-random == 0.02
+  pytest-runner == 4.2
+  pytest-xdist == 1.17.1
+  pytest-timeout == 1.2.1
+hdfs == 2.0.2
+  docopt == 0.6.2
+  execnet == 1.4.0
+
+# Requires pbr
+pywebhdfs == 0.3.2
+
+requests == 2.20.0
+   chardet == 3.0.4
+   idna == 2.8
+   urllib3 == 1.21.1
+   certifi == 2020.12.5
+
+#### Formerly compiled-requirements.txt
+
+argparse == 1.4.0
+impyla == 0.17a1
+  bitarray == 1.2.1
+  sasl == 0.2.1
+  # six == 1.14.0 (specified above)
+  thrift_sasl == 0.4.2
+psutil == 5.6.3
+# Required for Kudu:
+  Cython == 0.23.4
+  numpy == 1.10.4
+  pytz == 2018.3
diff --git a/infra/python/deps/compiled-requirements.txt b/infra/python/deps/setuptools-requirements.txt
similarity index 68%
rename from infra/python/deps/compiled-requirements.txt
rename to infra/python/deps/setuptools-requirements.txt
index 0914bc3..38ef911 100644
--- a/infra/python/deps/compiled-requirements.txt
+++ b/infra/python/deps/setuptools-requirements.txt
@@ -15,17 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Requirements that require a C/C++ compiler to build, which may not be available until
-# after the toolchain is bootstrapped. Installed after requirements.txt
-
-argparse == 1.4.0
-impyla == 0.17a1
-  bitarray == 1.2.1
-  sasl == 0.2.1
-  six == 1.14.0
-  thrift_sasl == 0.4.2
-psutil == 5.6.3
-# Required for Kudu:
-  Cython == 0.23.4
-  numpy == 1.10.4
-  pytz == 2018.3
+# Newer versions of setuptools don't support Python 2.6
+setuptools == 36.8.0
+setuptools-scm == 1.15.4
diff --git a/infra/python/deps/stage2-requirements.txt b/infra/python/deps/stage2-requirements.txt
deleted file mode 100644
index b5bc44b..0000000
--- a/infra/python/deps/stage2-requirements.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# This file contains packages that have dependencies in requirements.txt and that have to
-# be installed in a separate invocation of pip.
-
-# Requires setuptools-scm
-pytest == 2.9.2
-  py == 1.4.32
-  pytest-forked == 0.2
-  pytest-random == 0.02
-  pytest-runner == 4.2
-  pytest-xdist == 1.17.1
-  pytest-timeout == 1.2.1
-hdfs == 2.0.2
-  docopt == 0.6.2
-  execnet == 1.4.0
-
-# Requires pbr
-pywebhdfs == 0.3.2
-
-requests == 2.20.0
-   chardet == 3.0.4
-   idna == 2.8
-   urllib3 == 1.21.1
-   certifi == 2020.12.5
\ No newline at end of file