You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mj...@apache.org on 2017/04/08 00:21:31 UTC
incubator-impala git commit: IMPALA-5181: Extract PYPI metadata from
a webpage
Repository: incubator-impala
Updated Branches:
refs/heads/master 9e7fb830f -> 4a79c9e7e
IMPALA-5181: Extract PYPI metadata from a webpage
There were some build failures due to a failure to download a JSON file
containing package metadata from PYPI. We need to switch to downloading
this from a PYPI mirror. In order to be able to download the metadata
from a PYPI mirror, we need to be able to extract the data from a web page,
because PYPI mirrors do not always have a JSON interface.
We implement a regex-based HTML parser in this patch. Also, we increase
the number of download attempts and randomly vary the amount of time
between each attempt.
Testing:
- Tested locally against PYPI and a PYPI mirror.
- Ran a private build that passed (which used a PYPI mirror).
Change-Id: If3845a0d5f568d4352e3cc4883596736974fd7de
Reviewed-on: http://gerrit.cloudera.org:8080/6579
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4a79c9e7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4a79c9e7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4a79c9e7
Branch: refs/heads/master
Commit: 4a79c9e7e3928f919b5fb60bab4145ba886d6252
Parents: 9e7fb83
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Thu Mar 30 13:08:21 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Apr 8 00:19:08 2017 +0000
----------------------------------------------------------------------
infra/python/deps/pip_download.py | 90 +++++++++++++++++++++-------------
1 file changed, 57 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4a79c9e7/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 85def64..bd54d30 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -24,18 +24,20 @@
import json
import os
import os.path
+import re
import sys
from hashlib import md5
+from random import randint
from time import sleep
from urllib import urlopen, URLopener
-NUM_TRIES = 3
+NUM_DOWNLOAD_ATTEMPTS = 8
-PYPI_MIRROR = os.environ.get("PYPI_MIRROR", "https://pypi.python.org")
+PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
# The requirement files that list all of the required packages and versions.
REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
- 'kudu-requirements.txt']
+ 'kudu-requirements.txt']
def check_md5sum(filename, expected_md5):
actual_md5 = md5(open(filename).read()).hexdigest()
@@ -45,47 +47,69 @@ def retry(func):
'''Retry decorator.'''
def wrapper(*args, **kwargs):
- for _ in xrange(NUM_TRIES):
+ for try_num in xrange(NUM_DOWNLOAD_ATTEMPTS):
+ if try_num > 0:
+ sleep_len = randint(5, 10 * 2 ** try_num)
+ print 'Sleeping for {0} seconds before retrying'.format(sleep_len)
+ sleep(sleep_len)
try:
result = func(*args, **kwargs)
- if result: return result
+ if result:
+ return result
except Exception as e:
print e
- sleep(5)
- print "Download failed after several attempts."
+ print 'Download failed after several attempts.'
sys.exit(1)
return wrapper
+def get_package_info(pkg_name, pkg_version):
+ '''Returns the file name, path and md5 digest of the package.'''
+ # We store the matching result in the candidates list instead of returning right away
+ # to sort them and return the first value in alphabetical order. This ensures that the
+ # same result is always returned even if the ordering changed on the server.
+ candidates = []
+ url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
+ print 'Getting package info from {0}'.format(url)
+ # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
+ # We parse the page with regex instead of an html parser because that requires
+ # downloading an extra package before running this script. Since the HTML is guaranteed
+ # to be formatted according to PEP 503, this is acceptable.
+ pkg_info = urlopen(url).read()
+ # We assume that the URL includes a hash and the hash function is md5. This not strictly
+ # required by PEP 503.
+ regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
+ for match in re.finditer(regex, pkg_info):
+ path = match.group(1)
+ md5_digest = match.group(2)
+ file_name = match.group(3)
+ # Make sure that we consider only non Wheel archives, because those are not supported.
+ if (file_name.endswith('-{0}.tar.gz'.format(pkg_version)) or
+ file_name.endswith('-{0}.tar.bz2'.format(pkg_version)) or
+ file_name.endswith('-{0}.zip'.format(pkg_version))):
+ candidates.append((file_name, path, md5_digest))
+ if not candidates:
+ print 'Could not find archive to download for {0} {1}'.format(pkg_name, pkg_version)
+ return (None, None, None)
+ return sorted(candidates)[0]
+
@retry
def download_package(pkg_name, pkg_version):
- '''Download the required package. Sometimes the download can be flaky, so we use the
- retry decorator.'''
- pkg_type = 'sdist' # Don't download wheel archives for now
- # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
- # from pypi.python.org.
- pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())
-
+ file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
+ if not file_name:
+ return False
+ if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
+ print 'File with matching md5sum already exists, skipping {0}'.format(file_name)
+ return True
downloader = URLopener()
- for pkg in pkg_info['releases'][pkg_version]:
- if pkg['packagetype'] == pkg_type:
- filename = pkg['filename']
- expected_md5 = pkg['md5_digest']
- if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
- print "File with matching md5sum already exists, skipping %s" % filename
- return True
- pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
- print "Downloading %s from %s" % (filename, pkg_url)
- downloader.retrieve(pkg_url, filename)
- actual_md5 = md5(open(filename).read()).hexdigest()
- if check_md5sum(filename, expected_md5):
- return True
- else:
- print "MD5 mismatch in file %s." % filename
- return False
- print "Could not find archive to download for %s %s %s" % (
- pkg_name, pkg_version, pkg_type)
- sys.exit(1)
+ pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
+ print 'Downloading {0} from {1}'.format(file_name, pkg_url)
+ downloader.retrieve(pkg_url, file_name)
+ if check_md5sum(file_name, expected_md5):
+ return True
+ else:
+ print 'MD5 mismatch in file {0}.'.format(file_name)
+ return False
def main():
if len(sys.argv) > 1: