You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by tv...@apache.org on 2013/07/30 17:31:38 UTC

git commit: [#6458] Add google-code wiki page extraction

Updated Branches:
  refs/heads/tv/6458 [created] ca66e4791


[#6458] Add google-code wiki page extraction

Signed-off-by: Tim Van Steenburgh <tv...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/ca66e479
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/ca66e479
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/ca66e479

Branch: refs/heads/tv/6458
Commit: ca66e47919f17100fb92bdaef272a1c422443998
Parents: 61d0ca5
Author: Tim Van Steenburgh <tv...@gmail.com>
Authored: Tue Jul 30 14:39:04 2013 +0000
Committer: Tim Van Steenburgh <tv...@gmail.com>
Committed: Tue Jul 30 14:39:04 2013 +0000

----------------------------------------------------------------------
 ForgeImporters/forgeimporters/google/__init__.py  | 18 ++++++++++++++----
 .../forgeimporters/tests/google/test_extractor.py |  9 +++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ca66e479/ForgeImporters/forgeimporters/google/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py
index 17d724f..2d90ab3 100644
--- a/ForgeImporters/forgeimporters/google/__init__.py
+++ b/ForgeImporters/forgeimporters/google/__init__.py
@@ -34,11 +34,13 @@ from allura import model as M
 log = logging.getLogger(__name__)
 
 class GoogleCodeProjectExtractor(object):
+    BASE_URL = 'http://code.google.com'
     RE_REPO_TYPE = re.compile(r'(svn|hg|git)')
 
     PAGE_MAP = {
-            'project_info': 'http://code.google.com/p/%s/',
-            'source_browse': 'http://code.google.com/p/%s/source/browse/',
+            'project_info': BASE_URL + '/p/%s/',
+            'source_browse': BASE_URL + '/p/%s/source/browse/',
+            'wiki_index': BASE_URL + '/p/%s/w/list',
         }
 
     LICENSE_MAP = defaultdict(lambda:'Other/Proprietary License', {
@@ -57,8 +59,8 @@ class GoogleCodeProjectExtractor(object):
     DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'
 
     def __init__(self, project, page='project_info'):
-        gc_project_name = project.get_tool_data('google-code', 'project_name')
-        self.url = self.PAGE_MAP[page] % urllib.quote(gc_project_name)
+        self.gc_project_name = project.get_tool_data('google-code', 'project_name')
+        self.url = self.PAGE_MAP[page] % urllib.quote(self.gc_project_name)
         self.project = project
         self.page = BeautifulSoup(urllib2.urlopen(self.url))
 
@@ -93,3 +95,11 @@ class GoogleCodeProjectExtractor(object):
             return re_match.group(0)
         else:
             raise Exception("Unknown repo type: {0}".format(repo_type.text))
+
+    def get_wiki_pages(self):
+        RE_WIKI_PAGE_URL = r'^/p/{0}/wiki/.*$'.format(self.gc_project_name)
+        seen = set()
+        for a in self.page.find(id="resultstable").findAll("a"):
+            if re.match(RE_WIKI_PAGE_URL, a['href']) and a['href'] not in seen:
+                yield (a.text, self.BASE_URL + a['href'])
+                seen.add(a['href'])

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ca66e479/ForgeImporters/forgeimporters/tests/google/test_extractor.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
index e346f1e..a681d82 100644
--- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py
+++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
@@ -118,3 +118,12 @@ class TestGoogleCodeProjectExtractor(TestCase):
         with self.assertRaises(Exception) as cm:
             extractor.get_repo_type()
         self.assertEqual(str(cm.exception), "Unknown repo type: cvs")
+
+    def test_get_wiki_pages(self):
+        extractor = self._make_extractor('''
+        <div id="resultstable">
+            <a href="#">Link that's not a wiki page</a>
+            <a href="/p/my-project/wiki/PageOne">PageOne</a>
+        </div>''')
+        self.assertEqual(list(extractor.get_wiki_pages()), [
+            ('PageOne', 'http://code.google.com/p/my-project/wiki/PageOne')])