You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/05/04 15:17:00 UTC

[3/8] impala git commit: Speed up Python dependencies.

Speed up Python dependencies.

This parallelizes downloading some Python libraries, which speeds up
$IMPALA_HOME/infra/python/deps/download_requirements.  I've seen it
take 7-15 seconds before this change and 2-5 seconds after.

I also checked that we always have at least Python 2.6 when
building Impala, so I was able to remove the try/except
handling in bootstrap_toolchain.

Change-Id: I7cbf622adb7d037f1a53c519402dcd8ae3c0fe30
Reviewed-on: http://gerrit.cloudera.org:8080/10234
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/df3b5463
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/df3b5463
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/df3b5463

Branch: refs/heads/2.x
Commit: df3b5463b8d04881cce9da09a8942c047a53c6e5
Parents: ad33cf5
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Mon Apr 23 11:16:42 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu May 3 19:59:25 2018 +0000

----------------------------------------------------------------------
 bin/bootstrap_toolchain.py        | 18 +++++----------
 infra/python/deps/pip_download.py | 42 +++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/df3b5463/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index f54bf04..6070350 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -35,6 +35,7 @@
 #
 #     python bootstrap_toolchain.py
 import logging
+import multiprocessing.pool
 import os
 import random
 import re
@@ -350,19 +351,12 @@ extern "C" void %s() {
 
 def execute_many(f, args):
   """
-  Executes f(a) for a in args. If possible, uses a threadpool
-  to execute in parallel. The pool uses the number of CPUs
-  in the system as the default size.
+  Executes f(a) for a in args using a threadpool to execute in parallel.
+  The pool uses the smaller of 4 and the number of CPUs in the system
+  as the pool size.
   """
-  pool = None
-  try:
-    import multiprocessing.pool
-    pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
-    return pool.map(f, args, 1)
-  except ImportError:
-    # multiprocessing was introduced in Python 2.6.
-    # For older Pythons (CentOS 5), degrade to single-threaded execution:
-    return [ f(a) for a in args ]
+  pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
+  return pool.map(f, args, 1)
 
 def download_cdh_components(toolchain_root, cdh_components):
   """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found."""

http://git-wip-us.apache.org/repos/asf/impala/blob/df3b5463/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 0cce9e9..3e593c4 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -22,6 +22,7 @@
 # This script requires Python 2.6+.
 
 import hashlib
+import multiprocessing.pool
 import os
 import os.path
 import re
@@ -130,30 +131,29 @@ def main():
     download_package(pkg_name, pkg_version)
     return
 
+  pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
+  results = []
+
   for requirements_file in REQUIREMENTS_FILES:
     # If the package name and version are not specified in the command line arguments,
     # download the packages that in requirements.txt.
-    f = open(requirements_file, 'r')
-    try:
-      # requirements.txt follows the standard pip grammar.
-      for line in f:
-        # A hash symbol ("#") represents a comment that should be ignored.
-        hash_index = line.find('#')
-        if hash_index != -1:
-          line = line[:hash_index]
-        # A semi colon (";") specifies some additional condition for when the package
-        # should be installed (for example a specific OS). We can ignore this and download
-        # the package anyways because the installation script(bootstrap_virtualenv.py) can
-        # take it into account.
-        semi_colon_index = line.find(';')
-        if semi_colon_index != -1:
-          line = line[:semi_colon_index]
-        l = line.strip()
-        if len(l) > 0:
-          pkg_name, pkg_version = l.split('==')
-          download_package(pkg_name.strip(), pkg_version.strip())
-    finally:
-      f.close()
+    # requirements.txt follows the standard pip grammar.
+    for line in open(requirements_file):
+      # A hash symbol ("#") represents a comment that should be ignored.
+      line = line.split("#")[0]
+      # A semi colon (";") specifies some additional condition for when the package
+      # should be installed (for example a specific OS). We can ignore this and download
+      # the package anyways because the installation script(bootstrap_virtualenv.py) can
+      # take it into account.
+      l = line.split(";")[0].strip()
+      if not l:
+        continue
+      pkg_name, pkg_version = l.split('==')
+      results.append(pool.apply_async(
+        download_package, args=[pkg_name.strip(), pkg_version.strip()]))
+
+    for x in results:
+      x.get()
 
 if __name__ == '__main__':
   main()