You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/05/04 15:17:00 UTC
[3/8] impala git commit: Speed up Python dependencies.
Speed up Python dependencies.
This parallelizes downloading some Python libraries, which speeds up
$IMPALA_HOME/infra/python/deps/download_requirements. I've seen its run
time drop from 7-15 seconds before this change to 2-5 seconds after.
I also checked that we always have at least Python 2.6 when
building Impala, so I was able to remove the try/except ImportError
fallback around the multiprocessing import in bootstrap_toolchain.py.
Change-Id: I7cbf622adb7d037f1a53c519402dcd8ae3c0fe30
Reviewed-on: http://gerrit.cloudera.org:8080/10234
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/df3b5463
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/df3b5463
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/df3b5463
Branch: refs/heads/2.x
Commit: df3b5463b8d04881cce9da09a8942c047a53c6e5
Parents: ad33cf5
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Mon Apr 23 11:16:42 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu May 3 19:59:25 2018 +0000
----------------------------------------------------------------------
bin/bootstrap_toolchain.py | 18 +++++----------
infra/python/deps/pip_download.py | 42 +++++++++++++++++-----------------
2 files changed, 27 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/df3b5463/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index f54bf04..6070350 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -35,6 +35,7 @@
#
# python bootstrap_toolchain.py
import logging
+import multiprocessing.pool
import os
import random
import re
@@ -350,19 +351,12 @@ extern "C" void %s() {
def execute_many(f, args):
"""
- Executes f(a) for a in args. If possible, uses a threadpool
- to execute in parallel. The pool uses the number of CPUs
- in the system as the default size.
+ Executes f(a) for a in args using a threadpool to execute in parallel.
+ The pool uses the smaller of 4 and the number of CPUs in the system
+ as the pool size.
"""
- pool = None
- try:
- import multiprocessing.pool
- pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
- return pool.map(f, args, 1)
- except ImportError:
- # multiprocessing was introduced in Python 2.6.
- # For older Pythons (CentOS 5), degrade to single-threaded execution:
- return [ f(a) for a in args ]
+ pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
+ return pool.map(f, args, 1)
def download_cdh_components(toolchain_root, cdh_components):
"""Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found."""
http://git-wip-us.apache.org/repos/asf/impala/blob/df3b5463/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 0cce9e9..3e593c4 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -22,6 +22,7 @@
# This script requires Python 2.6+.
import hashlib
+import multiprocessing.pool
import os
import os.path
import re
@@ -130,30 +131,29 @@ def main():
download_package(pkg_name, pkg_version)
return
+ pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
+ results = []
+
for requirements_file in REQUIREMENTS_FILES:
# If the package name and version are not specified in the command line arguments,
# download the packages that in requirements.txt.
- f = open(requirements_file, 'r')
- try:
- # requirements.txt follows the standard pip grammar.
- for line in f:
- # A hash symbol ("#") represents a comment that should be ignored.
- hash_index = line.find('#')
- if hash_index != -1:
- line = line[:hash_index]
- # A semi colon (";") specifies some additional condition for when the package
- # should be installed (for example a specific OS). We can ignore this and download
- # the package anyways because the installation script(bootstrap_virtualenv.py) can
- # take it into account.
- semi_colon_index = line.find(';')
- if semi_colon_index != -1:
- line = line[:semi_colon_index]
- l = line.strip()
- if len(l) > 0:
- pkg_name, pkg_version = l.split('==')
- download_package(pkg_name.strip(), pkg_version.strip())
- finally:
- f.close()
+ # requirements.txt follows the standard pip grammar.
+ for line in open(requirements_file):
+ # A hash symbol ("#") represents a comment that should be ignored.
+ line = line.split("#")[0]
+ # A semi colon (";") specifies some additional condition for when the package
+ # should be installed (for example a specific OS). We can ignore this and download
+ # the package anyways because the installation script(bootstrap_virtualenv.py) can
+ # take it into account.
+ l = line.split(";")[0].strip()
+ if not l:
+ continue
+ pkg_name, pkg_version = l.split('==')
+ results.append(pool.apply_async(
+ download_package, args=[pkg_name.strip(), pkg_version.strip()]))
+
+ for x in results:
+ x.get()
if __name__ == '__main__':
main()