You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by ad...@apache.org on 2014/06/24 08:40:21 UTC
svn commit: r1605014 - in /labs/panopticon/pan-utils:
src/asf/data/releases.py tests/test_releases.py
Author: adc
Date: Tue Jun 24 06:40:21 2014
New Revision: 1605014
URL: http://svn.apache.org/r1605014
Log:
Fixed various bugs
- some resource pages have links that wander off the original site
- sometimes things recursively arrive back at the original URL
Modified:
labs/panopticon/pan-utils/src/asf/data/releases.py
labs/panopticon/pan-utils/tests/test_releases.py
Modified: labs/panopticon/pan-utils/src/asf/data/releases.py
URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/src/asf/data/releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff
==============================================================================
--- labs/panopticon/pan-utils/src/asf/data/releases.py (original)
+++ labs/panopticon/pan-utils/src/asf/data/releases.py Tue Jun 24 06:40:21 2014
@@ -51,24 +51,42 @@ def scrape_release_url(release_url, igno
ignore = (ignore or set()) | set(['../'])
+ visited = set()
+
def _scrape(scanning_url):
+ if scanning_url in visited:
+ return {}
+ else:
+ visited.add(scanning_url)
+
log.debug('scraping %s', scanning_url)
request = restkit.request(scanning_url, follow_redirect=True)
soup = bs4.BeautifulSoup(request.body_string())
- resources = collections.defaultdict(dict)
+ # let's put the protocol in canonical form so that it's easily compared
+ canonical_url = scanning_url.replace('https://', 'http://', count=1)
+
+ resources = collections.defaultdict(dict)
for link in soup.find_all('a'):
href = link.get('href')
- if href in ignore:
+ if not href or href in ignore:
log.debug('ignored href %s in %s', href, scanning_url)
continue
full_href = urlparse.urljoin(scanning_url, href)
+ # put the protocol in canonical form so that it's easily compared
+ if not full_href.replace('https://', 'http://', count=1).startswith(canonical_url):
+ # if we're wandering off the original release URL then we've
+ # accidentally hit a link that goes off-site
+ log.debug('ignored off-site href %s in %s', href, scanning_url)
+ continue
+
if href.endswith('/'):
resources.update(_scrape(full_href))
else:
- text = link.get_text()
- resources[full_href[original_len:-len(text) - 1].strip('/')][text] = full_href
+ resource_name = link.get_text()
+ resource_path = full_href[original_len:-len(resource_name) - 1].strip('/')
+ resources[resource_path][resource_name] = full_href
return resources
@@ -105,7 +123,10 @@ def verify_hash(resource_path, hash_path
hasher.update(buf)
buf = f.read(BLOCK_SIZE)
- return reported_hash.strip().lower() == hasher.hexdigest().strip().lower()
+ result = reported_hash.strip().lower() == hasher.hexdigest().strip().lower()
+ if not result:
+ log.warning('Bad hash "%s" != "%s"', reported_hash.strip().lower(), hasher.hexdigest().strip().lower())
+ return result
def verify_signature(resource_file, signature_file, gpg):
Modified: labs/panopticon/pan-utils/tests/test_releases.py
URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/tests/test_releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff
==============================================================================
--- labs/panopticon/pan-utils/tests/test_releases.py (original)
+++ labs/panopticon/pan-utils/tests/test_releases.py Tue Jun 24 06:40:21 2014
@@ -22,7 +22,7 @@ from asf.utils.test import ensure_gpg
@ensure_gpg
def test_scrape_release_url():
- assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000//', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
+ assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
assert releases.verify_hashes('https://dist.apache.org/repos/dist/dev/incubator/sirona/0.2-incubating/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachemrql-1001/', 'http://www.apache.org/dist/incubator/mrql/KEYS')
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org