You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by tv...@apache.org on 2013/07/30 17:31:38 UTC
git commit: [#6458] Add google-code wiki page extraction
Updated Branches:
refs/heads/tv/6458 [created] ca66e4791
[#6458] Add google-code wiki page extraction
Signed-off-by: Tim Van Steenburgh <tv...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/ca66e479
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/ca66e479
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/ca66e479
Branch: refs/heads/tv/6458
Commit: ca66e47919f17100fb92bdaef272a1c422443998
Parents: 61d0ca5
Author: Tim Van Steenburgh <tv...@gmail.com>
Authored: Tue Jul 30 14:39:04 2013 +0000
Committer: Tim Van Steenburgh <tv...@gmail.com>
Committed: Tue Jul 30 14:39:04 2013 +0000
----------------------------------------------------------------------
ForgeImporters/forgeimporters/google/__init__.py | 18 ++++++++++++++----
.../forgeimporters/tests/google/test_extractor.py | 9 +++++++++
2 files changed, 23 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ca66e479/ForgeImporters/forgeimporters/google/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py
index 17d724f..2d90ab3 100644
--- a/ForgeImporters/forgeimporters/google/__init__.py
+++ b/ForgeImporters/forgeimporters/google/__init__.py
@@ -34,11 +34,13 @@ from allura import model as M
log = logging.getLogger(__name__)
class GoogleCodeProjectExtractor(object):
+ BASE_URL = 'http://code.google.com'
RE_REPO_TYPE = re.compile(r'(svn|hg|git)')
PAGE_MAP = {
- 'project_info': 'http://code.google.com/p/%s/',
- 'source_browse': 'http://code.google.com/p/%s/source/browse/',
+ 'project_info': BASE_URL + '/p/%s/',
+ 'source_browse': BASE_URL + '/p/%s/source/browse/',
+ 'wiki_index': BASE_URL + '/p/%s/w/list',
}
LICENSE_MAP = defaultdict(lambda:'Other/Proprietary License', {
@@ -57,8 +59,8 @@ class GoogleCodeProjectExtractor(object):
DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'
def __init__(self, project, page='project_info'):
- gc_project_name = project.get_tool_data('google-code', 'project_name')
- self.url = self.PAGE_MAP[page] % urllib.quote(gc_project_name)
+ self.gc_project_name = project.get_tool_data('google-code', 'project_name')
+ self.url = self.PAGE_MAP[page] % urllib.quote(self.gc_project_name)
self.project = project
self.page = BeautifulSoup(urllib2.urlopen(self.url))
@@ -93,3 +95,11 @@ class GoogleCodeProjectExtractor(object):
return re_match.group(0)
else:
raise Exception("Unknown repo type: {0}".format(repo_type.text))
+
+ def get_wiki_pages(self):
+ RE_WIKI_PAGE_URL = r'^/p/{0}/wiki/.*$'.format(self.gc_project_name)
+ seen = set()
+ for a in self.page.find(id="resultstable").findAll("a"):
+ if re.match(RE_WIKI_PAGE_URL, a['href']) and a['href'] not in seen:
+ yield (a.text, self.BASE_URL + a['href'])
+ seen.add(a['href'])
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ca66e479/ForgeImporters/forgeimporters/tests/google/test_extractor.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
index e346f1e..a681d82 100644
--- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py
+++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
@@ -118,3 +118,12 @@ class TestGoogleCodeProjectExtractor(TestCase):
with self.assertRaises(Exception) as cm:
extractor.get_repo_type()
self.assertEqual(str(cm.exception), "Unknown repo type: cvs")
+
+ def test_get_wiki_pages(self):
+ extractor = self._make_extractor('''
+ <div id="resultstable">
+ <a href="#">Link that's not a wiki page</a>
+ <a href="/p/my-project/wiki/PageOne">PageOne</a>
+ </div>''')
+ self.assertEqual(list(extractor.get_wiki_pages()), [
+ ('PageOne', 'http://code.google.com/p/my-project/wiki/PageOne')])