You are viewing a plain text version of this content. The hyperlink to the canonical version is not preserved in this plain text copy.
Posted to commits@allura.apache.org by je...@apache.org on 2015/08/13 20:06:30 UTC

[03/50] allura git commit: [#7947] use beautifulsoup4 for correct identification of tricky tag situations during URL rewriting

[#7947] use beautifulsoup4 for correct identification of tricky tag situations during URL rewriting


Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/e0e2f0c4
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/e0e2f0c4
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/e0e2f0c4

Branch: refs/heads/ib/7922
Commit: e0e2f0c4057a33256d59da72c73626848a1d6579
Parents: 556a99e
Author: Dave Brondsema <db...@slashdotmedia.com>
Authored: Mon Aug 3 18:45:36 2015 +0000
Committer: Dave Brondsema <db...@slashdotmedia.com>
Committed: Mon Aug 3 18:55:26 2015 +0000

----------------------------------------------------------------------
 Allura/allura/lib/markdown_extensions.py | 17 ++++++-----------
 Allura/allura/tests/test_globals.py      |  7 +++++++
 requirements.txt                         |  3 ++-
 3 files changed, 15 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/allura/blob/e0e2f0c4/Allura/allura/lib/markdown_extensions.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/markdown_extensions.py b/Allura/allura/lib/markdown_extensions.py
index 83e8069..cbccf5d 100644
--- a/Allura/allura/lib/markdown_extensions.py
+++ b/Allura/allura/lib/markdown_extensions.py
@@ -20,7 +20,7 @@ import logging
 from urlparse import urljoin
 
 from tg import config
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 import html5lib
 import html5lib.serializer
 import html5lib.filters.alphabeticalattributes
@@ -441,7 +441,8 @@ class RelativeLinkRewriter(markdown.postprocessors.Postprocessor):
         self._make_absolute = make_absolute
 
     def run(self, text):
-        soup = BeautifulSoup(text)
+        soup = BeautifulSoup(text, 'html5lib')  # 'html.parser' parser gives weird </li> behaviour with test_macro_members
+
         if self._make_absolute:
             rewrite = self._rewrite_abs
         else:
@@ -450,15 +451,9 @@ class RelativeLinkRewriter(markdown.postprocessors.Postprocessor):
             rewrite(link, 'href')
         for link in soup.findAll('img'):
             rewrite(link, 'src')
-        # BeautifulSoup always stores data in unicode,
-        # but when doing unicode(soup) it does some strange things
-        # like nesting html comments, e.g. returns <!--<!-- comment -->-->
-        # instead of <!-- comment -->.
-        # Converting soup object to string representation first,
-        # and then back to unicode avoids that.
-        # str() called on BeautifulSoup document always returns string
-        # encoded in utf-8, so this should always work.
-        return h.really_unicode(str(soup))
+
+        # html5lib parser adds html/head/body tags, so output <body> without its own tags
+        return unicode(soup.body)[len('<body>'):-len('</body>')]
 
     def _rewrite(self, tag, attr):
         val = tag.get(attr)

http://git-wip-us.apache.org/repos/asf/allura/blob/e0e2f0c4/Allura/allura/tests/test_globals.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/test_globals.py b/Allura/allura/tests/test_globals.py
index ca04652..91565ae 100644
--- a/Allura/allura/tests/test_globals.py
+++ b/Allura/allura/tests/test_globals.py
@@ -480,6 +480,13 @@ def test_markdown_invalid_tagslash():
     r = g.markdown.convert('<div/onload><img src=x onerror=alert(document.cookie)>')
     assert_not_in('onerror', r)
 
+def test_markdown_invalid_script_in_link():
+    r = g.markdown.convert('[xss](http://"><a onmouseover=prompt(document.domain)>xss</a>)')
+    assert_equal('''<div class="markdown_content"><p><a class="" href='http://"&gt;&lt;a%20onmouseover=prompt(document.domain)&gt;xss&lt;/a&gt;' rel="nofollow">xss</a></p></div>''', r)
+
+def test_markdown_invalid_script_in_link2():
+    r = g.markdown.convert('[xss](http://"><img src=x onerror=alert(document.cookie)>)')
+    assert_equal('''<div class="markdown_content"><p><a class="" href='http://"&gt;&lt;img%20src=x%20onerror=alert(document.cookie)&gt;' rel="nofollow">xss</a></p></div>''', r)
 
 @td.with_wiki
 def test_macro_include():

http://git-wip-us.apache.org/repos/asf/allura/blob/e0e2f0c4/requirements.txt
----------------------------------------------------------------------
diff --git a/requirements.txt b/requirements.txt
index 77581e2..d327584 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 ActivityStream==0.2.0
 BeautifulSoup==3.2.0
+BeautifulSoup4==4.4.0
 Beaker==1.6.4
 chardet==1.0.1
 colander==0.9.3
@@ -81,4 +82,4 @@ q==2.3
 WebError==0.10.3
 -e git://github.com/brondsem/sphinx-argparse.git#egg=sphinx-argparse   # pending merge requests
 sphinx-rtd-theme==0.1.6
-sphinxcontrib-programoutput==0.8
\ No newline at end of file
+sphinxcontrib-programoutput==0.8