You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by ad...@apache.org on 2014/06/24 08:40:21 UTC

svn commit: r1605014 - in /labs/panopticon/pan-utils: src/asf/data/releases.py tests/test_releases.py

Author: adc
Date: Tue Jun 24 06:40:21 2014
New Revision: 1605014

URL: http://svn.apache.org/r1605014
Log:
Fixed various bugs

- some resource pages have links that wander off the original site
- sometimes things recursively arrive back to the original URL

Modified:
    labs/panopticon/pan-utils/src/asf/data/releases.py
    labs/panopticon/pan-utils/tests/test_releases.py

Modified: labs/panopticon/pan-utils/src/asf/data/releases.py
URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/src/asf/data/releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff
==============================================================================
--- labs/panopticon/pan-utils/src/asf/data/releases.py (original)
+++ labs/panopticon/pan-utils/src/asf/data/releases.py Tue Jun 24 06:40:21 2014
@@ -51,24 +51,42 @@ def scrape_release_url(release_url, igno
 
     ignore = (ignore or set()) | set(['../'])
 
+    visited = set()
+
     def _scrape(scanning_url):
+        if scanning_url in visited:
+            return {}
+        else:
+            visited.add(scanning_url)
+
         log.debug('scraping %s', scanning_url)
         request = restkit.request(scanning_url, follow_redirect=True)
         soup = bs4.BeautifulSoup(request.body_string())
-        resources = collections.defaultdict(dict)
 
+        # let's put the protocol in canonical form so that it's easily compared
+        canonical_url = scanning_url.replace('https://', 'http://', count=1)
+
+        resources = collections.defaultdict(dict)
         for link in soup.find_all('a'):
             href = link.get('href')
-            if href in ignore:
+            if not href or href in ignore:
                 log.debug('ignored href %s in %s', href, scanning_url)
                 continue
 
             full_href = urlparse.urljoin(scanning_url, href)
+            # put the protocol in canonical form so that it's easily compared
+            if not full_href.replace('https://', 'http://', count=1).startswith(canonical_url):
+                # if we're wandering off the original release URL then we've
+                # accidentally hit a link that goes off-site
+                log.debug('ignored off-site href %s in %s', href, scanning_url)
+                continue
+
             if href.endswith('/'):
                 resources.update(_scrape(full_href))
             else:
-                text = link.get_text()
-                resources[full_href[original_len:-len(text) - 1].strip('/')][text] = full_href
+                resource_name = link.get_text()
+                resource_path = full_href[original_len:-len(resource_name) - 1].strip('/')
+                resources[resource_path][resource_name] = full_href
 
         return resources
 
@@ -105,7 +123,10 @@ def verify_hash(resource_path, hash_path
             hasher.update(buf)
             buf = f.read(BLOCK_SIZE)
 
-    return reported_hash.strip().lower() == hasher.hexdigest().strip().lower()
+    result = reported_hash.strip().lower() == hasher.hexdigest().strip().lower()
+    if not result:
+        log.warning('Bad hash "%s" != "%s"', reported_hash.strip().lower(), hasher.hexdigest().strip().lower())
+    return result
 
 
 def verify_signature(resource_file, signature_file, gpg):

Modified: labs/panopticon/pan-utils/tests/test_releases.py
URL: http://svn.apache.org/viewvc/labs/panopticon/pan-utils/tests/test_releases.py?rev=1605014&r1=1605013&r2=1605014&view=diff
==============================================================================
--- labs/panopticon/pan-utils/tests/test_releases.py (original)
+++ labs/panopticon/pan-utils/tests/test_releases.py Tue Jun 24 06:40:21 2014
@@ -22,7 +22,7 @@ from asf.utils.test import ensure_gpg
 
 @ensure_gpg
 def test_scrape_release_url():
-    assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000//', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
+    assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachesirona-1000/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
     assert releases.verify_hashes('https://dist.apache.org/repos/dist/dev/incubator/sirona/0.2-incubating/', 'https://dist.apache.org/repos/dist/release/incubator/sirona/KEYS')
 
     assert releases.verify_hashes('https://repository.apache.org/content/repositories/orgapachemrql-1001/', 'http://www.apache.org/dist/incubator/mrql/KEYS')



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org