You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by jo...@apache.org on 2013/08/26 19:20:09 UTC
[5/6] git commit: [#6531] Refactored get_page to accept parser argument
[#6531] Refactored get_page to accept parser argument
Signed-off-by: Cory Johns <cj...@slashdotmedia.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/ff5af166
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/ff5af166
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/ff5af166
Branch: refs/heads/cj/6531
Commit: ff5af166fae48eaa3f0f8c7a2f969b0764d5e4e8
Parents: 2d5cf6c
Author: Cory Johns <cj...@slashdotmedia.com>
Authored: Fri Aug 23 20:51:59 2013 +0000
Committer: Cory Johns <cj...@slashdotmedia.com>
Committed: Mon Aug 26 17:19:57 2013 +0000
----------------------------------------------------------------------
ForgeImporters/forgeimporters/base.py | 13 +++-
.../forgeimporters/google/__init__.py | 65 +++++++++++---------
.../tests/google/functional/test_tracker.py | 2 +-
.../tests/google/test_extractor.py | 29 +++++----
4 files changed, 65 insertions(+), 44 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/base.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/base.py b/ForgeImporters/forgeimporters/base.py
index 3cf6774..19a3d70 100644
--- a/ForgeImporters/forgeimporters/base.py
+++ b/ForgeImporters/forgeimporters/base.py
@@ -86,12 +86,16 @@ class ProjectExtractor(object):
req.add_header('User-Agent', 'Allura Data Importer (http://sf.net/p/allura)')
return h.urlopen(req, retries=retries, codes=codes)
- def get_page(self, page_name_or_url, **kw):
+ def get_page(self, page_name_or_url, parser=None, **kw):
"""Return a Beautiful soup object for the given page name or url.
If a page name is provided, the associated url is looked up in
:attr:`PAGE_MAP`.
+ If provided, the class or callable passed in :param:`parser` will be
+ used to transform the result of the `urlopen` before returning it.
+ Otherwise, the class's :meth:`parse_page` will be used.
+
Results are cached so that subsequent calls for the same page name or
url will return the cached result rather than making another HTTP
request.
@@ -104,8 +108,10 @@ class ProjectExtractor(object):
if self.url in self._page_cache:
self.page = self._page_cache[self.url]
else:
+ if parser is None:
+ parser = self.parse_page
self.page = self._page_cache[self.url] = \
- self.parse_page(self.urlopen(self.url))
+ parser(self.urlopen(self.url))
return self.page
def get_page_url(self, page_name, **kw):
@@ -125,7 +131,8 @@ class ProjectExtractor(object):
the html.
Subclasses can override to change the behavior or handle other types
- of content (like JSON).
+ of content (like JSON). The parser can also be overridden via the
+ `parser` parameter to :meth:`get_page`
:param page: A file-like object return from :meth:`urlopen`
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/google/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py
index 29e5011..849f924 100644
--- a/ForgeImporters/forgeimporters/google/__init__.py
+++ b/ForgeImporters/forgeimporters/google/__init__.py
@@ -51,6 +51,24 @@ def _as_text(node, chunks=None):
_as_text(n, chunks)
return ''.join(chunks)
+def csv_parser(page):
+ lines = page.readlines()
+ if not lines:
+ return []
+ # skip CSV header
+ lines = lines[1:]
+ # skip "next page here" info footer
+ if not lines[-1].startswith('"'):
+ lines.pop()
+ # remove CSV wrapping (quotes, commas, newlines)
+ return [line.strip('",\n') for line in lines]
+
+def stringio_parser(page):
+ return {
+ 'content-type': page.info()['content-type'],
+ 'data': StringIO(page.read()),
+ }
+
class GoogleCodeProjectExtractor(ProjectExtractor):
BASE_URL = 'http://code.google.com'
@@ -88,11 +106,9 @@ class GoogleCodeProjectExtractor(ProjectExtractor):
if icon_url == self.DEFAULT_ICON:
return
icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1]
- fp_ish = self.urlopen(icon_url)
- fp = StringIO(fp_ish.read())
+ icon = File(icon_url, icon_name)
M.ProjectFile.save_image(
- icon_name, fp,
- fp_ish.info()['content-type'].split(';')[0], # strip off charset=x extra param,
+ icon_name, icon.file, icon.type,
square=True, thumbnail_size=(48,48),
thumbnail_meta={'project_id': project._id, 'category': 'icon'})
@@ -115,16 +131,6 @@ class GoogleCodeProjectExtractor(ProjectExtractor):
raise Exception("Unknown repo type: {0}".format(repo_type.text))
@classmethod
- def _get_issue_ids_page(cls, project_name, start):
- url = cls.PAGE_MAP['issues_csv'].format(project_name=project_name, start=start)
- with closing(cls.urlopen(url)) as fp:
- lines = fp.readlines()[1:] # skip CSV header
- if not lines[-1].startswith('"'):
- lines.pop() # skip "next page here" info footer
- issue_ids = [line.strip('",\n') for line in lines]
- return issue_ids
-
- @classmethod
def iter_issues(cls, project_name):
"""
Iterate over all issues for a project,
@@ -133,13 +139,14 @@ class GoogleCodeProjectExtractor(ProjectExtractor):
start = 0
limit = 100
- while True:
- issue_ids = cls._get_issue_ids_page(project_name, start)
- if len(issue_ids) <= 0:
+ extractor = cls(project_name, 'issues_csv', parser=csv_parser, start=start)
+ while extractor.page:
+ if len(extractor.page) <= 0:
return
- for issue_id in issue_ids:
+ for issue_id in extractor.page:
yield (int(issue_id), cls(project_name, 'issue', issue_id=issue_id))
start += limit
+ extractor.get_page('issues_csv', parser=csv_parser, start=start)
def get_issue_summary(self):
text = self.page.find(id='issueheader').findAll('td', limit=2)[1].span.string.strip()
@@ -256,14 +263,16 @@ class Comment(object):
)
return text
-class Attachment(object):
- def __init__(self, tag):
- self.url = urljoin(GoogleCodeProjectExtractor.BASE_URL, tag.get('href'))
- self.filename = parse_qs(urlparse(self.url).query)['name'][0]
- self.type = None
+class File(object):
+ def __init__(self, url, filename):
+ extractor = GoogleCodeProjectExtractor(None, url, parser=stringio_parser)
+ self.url = url
+ self.filename = filename
+ self.type = extractor.page['content-type'].split(';')[0]
+ self.file = extractor.page['data']
- @property
- def file(self):
- fp_ish = GoogleCodeProjectExtractor(None).urlopen(self.url)
- fp = StringIO(fp_ish.read())
- return fp
+class Attachment(File):
+ def __init__(self, tag):
+ url = urljoin(GoogleCodeProjectExtractor.BASE_URL, tag.get('href'))
+ filename = parse_qs(urlparse(url).query)['name'][0]
+ super(Attachment, self).__init__(url, filename)
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py b/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py
index 184f7fd..2e5f542 100644
--- a/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py
+++ b/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py
@@ -51,7 +51,7 @@ class TestGCTrackerImporter(TestCase):
with mock.patch.object(base.h, 'urlopen') as urlopen,\
mock.patch.object(google.tracker, 'GoogleCodeProjectExtractor') as GPE,\
mock.patch('forgetracker.tasks.update_bin_counts') as ubc:
- urlopen.side_effect = lambda req, **kw: mock.Mock(read=req.get_full_url)
+ urlopen.side_effect = lambda req, **kw: mock.Mock(read=req.get_full_url, info=lambda:{'content-type': 'text/plain'})
GPE.iter_issues.return_value = [(issue_id, issue)]
gti = google.tracker.GoogleCodeTrackerImporter()
gti.import_tool(self.project, self.user, 'test-issue-project', mount_point='test-issue')
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/tests/google/test_extractor.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
index d5a9f22..668662e 100644
--- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py
+++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
@@ -64,6 +64,10 @@ class TestGoogleCodeProjectExtractor(TestCase):
page = extractor.get_page('source_browse')
self.assertEqual(2, self.urlopen.call_count)
self.assertEqual(page, extractor._page_cache['http://code.google.com/p/my-project/source/browse/'])
+ parser = mock.Mock(return_value='parsed')
+ page = extractor.get_page('url', parser=parser)
+ self.assertEqual(page, 'parsed')
+ self.assertEqual(page, extractor._page_cache['url'])
def test_get_page_url(self):
extractor = google.GoogleCodeProjectExtractor('my-project')
@@ -79,22 +83,20 @@ class TestGoogleCodeProjectExtractor(TestCase):
extractor.page.find.assert_called_once_with(itemprop='description')
self.assertEqual(self.project.short_description, 'My Super Project')
- @mock.patch.object(google, 'StringIO')
+ @mock.patch.object(google, 'File')
@mock.patch.object(google, 'M')
- def test_get_icon(self, M, StringIO):
- self.urlopen.return_value.info.return_value = {'content-type': 'image/png'}
+ def test_get_icon(self, M, File):
+ File.return_value.type = 'image/png'
+ File.return_value.file = 'data'
extractor = google.GoogleCodeProjectExtractor('my-project', 'project_info')
extractor.page.find.return_value.get.return_value = 'http://example.com/foo/bar/my-logo.png'
- self.urlopen.reset_mock()
extractor.get_icon(self.project)
extractor.page.find.assert_called_once_with(itemprop='image')
- self.urlopen.assert_called_once_with('http://example.com/foo/bar/my-logo.png')
- self.urlopen.return_value.info.assert_called_once_with()
- StringIO.assert_called_once_with(self.urlopen.return_value.read.return_value)
+ File.assert_called_once_with('http://example.com/foo/bar/my-logo.png', 'my-logo.png')
M.ProjectFile.save_image.assert_called_once_with(
- 'my-logo.png', StringIO.return_value, 'image/png', square=True,
+ 'my-logo.png', 'data', 'image/png', square=True,
thumbnail_size=(48,48), thumbnail_meta={
'project_id': self.project._id, 'category': 'icon'})
@@ -209,19 +211,22 @@ class TestGoogleCodeProjectExtractor(TestCase):
'OpSys-OSX',
])
- def test_get_issue_attachments(self):
+ @mock.patch.object(google, 'StringIO')
+ def test_get_issue_attachments(self, StringIO):
+ self.urlopen.return_value.info.return_value = {'content-type': 'text/plain; foo'}
test_issue = open(pkg_resources.resource_filename('forgeimporters', 'tests/data/google/test-issue.html')).read()
gpe = self._make_extractor(test_issue)
attachments = gpe.get_issue_attachments()
self.assertEqual(len(attachments), 2)
self.assertEqual(attachments[0].filename, 'at1.txt')
self.assertEqual(attachments[0].url, 'http://allura-google-importer.googlecode.com/issues/attachment?aid=70000000&name=at1.txt&token=3REU1M3JUUMt0rJUg7ldcELt6LA%3A1376059941255')
- self.assertIsNone(attachments[0].type)
+ self.assertEqual(attachments[0].type, 'text/plain')
self.assertEqual(attachments[1].filename, 'at2.txt')
self.assertEqual(attachments[1].url, 'http://allura-google-importer.googlecode.com/issues/attachment?aid=70000001&name=at2.txt&token=C9Hn4s1-g38hlSggRGo65VZM1ys%3A1376059941255')
- self.assertIsNone(attachments[1].type)
+ self.assertEqual(attachments[1].type, 'text/plain')
- def test_iter_comments(self):
+ @mock.patch.object(google, 'StringIO')
+ def test_iter_comments(self, StringIO):
test_issue = open(pkg_resources.resource_filename('forgeimporters', 'tests/data/google/test-issue.html')).read()
gpe = self._make_extractor(test_issue)
comments = list(gpe.iter_comments())