You are viewing a plain-text version of this content; the canonical link to the original message was not preserved in this copy.
Posted to commits@allura.apache.org by tv...@apache.org on 2013/08/07 15:36:58 UTC

[05/14] git commit: [#6458] Add google-code wiki page extraction

[#6458] Add google-code wiki page extraction

Signed-off-by: Tim Van Steenburgh <tv...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/c192a843
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/c192a843
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/c192a843

Branch: refs/heads/tv/6480
Commit: c192a843ffb850eb4e7dba472a57cd852fec81ce
Parents: e3663fb
Author: Tim Van Steenburgh <tv...@gmail.com>
Authored: Tue Jul 30 14:39:04 2013 +0000
Committer: Tim Van Steenburgh <tv...@gmail.com>
Committed: Wed Aug 7 12:26:52 2013 +0000

----------------------------------------------------------------------
 ForgeImporters/forgeimporters/google/__init__.py   | 17 ++++++++++++++---
 .../forgeimporters/tests/google/test_extractor.py  |  9 +++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c192a843/ForgeImporters/forgeimporters/google/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py
index 57e384b..8c91fd3 100644
--- a/ForgeImporters/forgeimporters/google/__init__.py
+++ b/ForgeImporters/forgeimporters/google/__init__.py
@@ -34,11 +34,13 @@ from allura import model as M
 log = logging.getLogger(__name__)
 
 class GoogleCodeProjectExtractor(object):
+    BASE_URL = 'http://code.google.com'
     RE_REPO_TYPE = re.compile(r'(svn|hg|git)')
 
     PAGE_MAP = {
-            'project_info': 'http://code.google.com/p/%s/',
-            'source_browse': 'http://code.google.com/p/%s/source/browse/',
+            'project_info': BASE_URL + '/p/%s/',
+            'source_browse': BASE_URL + '/p/%s/source/browse/',
+            'wiki_index': BASE_URL + '/p/%s/w/list',
         }
 
     LICENSE_MAP = defaultdict(lambda:'Other/Proprietary License', {
@@ -58,7 +60,8 @@ class GoogleCodeProjectExtractor(object):
 
     def __init__(self, allura_project, gc_project_name, page):
         self.project = allura_project
-        self.url = self.PAGE_MAP[page] % urllib.quote(gc_project_name)
+        self.gc_project_name = gc_project_name
+        self.url = self.PAGE_MAP[page] % urllib.quote(self.gc_project_name)
         self.page = BeautifulSoup(urllib2.urlopen(self.url))
 
     def get_short_description(self):
@@ -92,3 +95,11 @@ class GoogleCodeProjectExtractor(object):
             return re_match.group(0)
         else:
             raise Exception("Unknown repo type: {0}".format(repo_type.text))
+
+    def get_wiki_pages(self):
+        RE_WIKI_PAGE_URL = r'^/p/{0}/wiki/.*$'.format(self.gc_project_name)
+        seen = set()
+        for a in self.page.find(id="resultstable").findAll("a"):
+            if re.match(RE_WIKI_PAGE_URL, a['href']) and a['href'] not in seen:
+                yield (a.text, self.BASE_URL + a['href'])
+                seen.add(a['href'])

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c192a843/ForgeImporters/forgeimporters/tests/google/test_extractor.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
index 1a3a87c..250759f 100644
--- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py
+++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
@@ -118,3 +118,12 @@ class TestGoogleCodeProjectExtractor(TestCase):
         with self.assertRaises(Exception) as cm:
             extractor.get_repo_type()
         self.assertEqual(str(cm.exception), "Unknown repo type: cvs")
+
+    def test_get_wiki_pages(self):
+        extractor = self._make_extractor('''
+        <div id="resultstable">
+            <a href="#">Link that's not a wiki page</a>
+            <a href="/p/my-project/wiki/PageOne">PageOne</a>
+        </div>''')
+        self.assertEqual(list(extractor.get_wiki_pages()), [
+            ('PageOne', 'http://code.google.com/p/my-project/wiki/PageOne')])