You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by br...@apache.org on 2018/07/31 19:14:23 UTC
[2/2] allura git commit: [#8221] upgrade html5lib

[#8221] upgrade html5lib


Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/e67a58af
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/e67a58af
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/e67a58af

Branch: refs/heads/db/8221
Commit: e67a58af802a882e9979816316d19d46c378156d
Parents: 7a35db4
Author: Dave Brondsema <da...@brondsema.net>
Authored: Tue Jul 31 15:14:16 2018 -0400
Committer: Dave Brondsema <da...@brondsema.net>
Committed: Tue Jul 31 15:14:16 2018 -0400

----------------------------------------------------------------------
 Allura/allura/lib/markdown_extensions.py | 21 +++++---
 Allura/allura/lib/utils.py               | 76 ++++++++++++++++-----------
 Allura/allura/tests/test_markdown.py     |  8 +--
 Allura/allura/tests/test_utils.py        | 17 ++++--
 requirements.txt                         |  6 ++-
 5 files changed, 79 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/lib/markdown_extensions.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/markdown_extensions.py b/Allura/allura/lib/markdown_extensions.py
index c292d66..d87687b 100644
--- a/Allura/allura/lib/markdown_extensions.py
+++ b/Allura/allura/lib/markdown_extensions.py
@@ -29,7 +29,7 @@ import markdown
 from . import macro
 from . import helpers as h
 from allura import model as M
-from allura.lib.utils import ForgeHTMLSanitizer
+from allura.lib.utils import ForgeHTMLSanitizerFilter
 
 log = logging.getLogger(__name__)
 
@@ -472,13 +472,18 @@ class RelativeLinkRewriter(markdown.postprocessors.Postprocessor):
 class HTMLSanitizer(markdown.postprocessors.Postprocessor):
 
     def run(self, text):
-        parser = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer)
-        parsed = parser.parse(text)
-        serializer = html5lib.serializer.HTMLSerializer()
-        walker = html5lib.getTreeWalker("etree")
-        stream = html5lib.filters.alphabeticalattributes.Filter(walker(parsed))
-        out = ''.join(serializer.serialize(stream))
-        return out
+        parsed = html5lib.parseFragment(text)
+
+        # if we didn't have to customize our sanitization, could just do:
+        # return html5lib.serialize(parsed, sanitize=True)
+
+        # instead we do the same steps as that function,
+        # but add our ForgeHTMLSanitizerFilter instead of sanitize=True which would use the standard one
+        TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
+        walker = TreeWalker(parsed)
+        walker = ForgeHTMLSanitizerFilter(walker)  # this is our custom step
+        s = html5lib.serializer.HTMLSerializer()
+        return s.render(walker)
 
 
 class AutolinkPattern(markdown.inlinepatterns.Pattern):

http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/lib/utils.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/utils.py b/Allura/allura/lib/utils.py
index 1ec7f6a..f40d0a4 100644
--- a/Allura/allura/lib/utils.py
+++ b/Allura/allura/lib/utils.py
@@ -15,17 +15,12 @@
 #       specific language governing permissions and limitations
 #       under the License.
 from contextlib import contextmanager
-
 import time
 import string
 import hashlib
 import binascii
 import logging.handlers
 import codecs
-
-from html5lib.constants import tokenTypes
-
-from ming.odm import session
 import os.path
 import datetime
 import random
@@ -47,11 +42,11 @@ from webhelpers.html import literal
 from webob import exc
 from pygments.formatters import HtmlFormatter
 from setproctitle import getproctitle
-import html5lib.sanitizer
-
+import html5lib.filters.sanitizer
 from ew import jinja2_ew as ew
 from ming.utils import LazyProperty
 from ming.odm.odmsession import ODMCursor
+from ming.odm import session
 
 MARKDOWN_EXTENSIONS = ['.markdown', '.mdown', '.mkdn', '.mkd', '.md']
 
@@ -570,40 +565,57 @@ def serve_file(fp, filename, content_type, last_modified=None,
         return iter(lambda: fp.read(block_size), '')
 
 
-class ForgeHTMLSanitizer(html5lib.sanitizer.HTMLSanitizer):
-    # remove some elements from the sanitizer whitelist
-    # <form> and <input> could be used for a social engineering attack to construct a form
-    # others are just unexpected and confusing, and have no need to be used in markdown
-    _form_elements = ('button', 'datalist', 'fieldset', 'form', 'input', 'label', 'legend', 'meter', 'optgroup',
-                      'option', 'output', 'progress', 'select', 'textarea')
-    _forge_acceptable_elements = [e for e in html5lib.sanitizer.HTMLSanitizer.acceptable_elements
-                                  if e not in (_form_elements)]
-    allowed_elements = _forge_acceptable_elements \
-                       + html5lib.sanitizer.HTMLSanitizer.mathml_elements \
-                       + html5lib.sanitizer.HTMLSanitizer.svg_elements
+class ForgeHTMLSanitizerFilter(html5lib.filters.sanitizer.Filter):
 
-    # srcset is used in our own project_list/project_summary widgets which are used as macros so go through markdown
-    allowed_attributes = html5lib.sanitizer.HTMLSanitizer.allowed_attributes + ['srcset']
-
-    valid_iframe_srcs = ('https://www.youtube.com/embed/', 'https://www.gittip.com/')
-
-    _prev_token_was_ok_iframe = False
+    def __init__(self, *args, **kwargs):
+        super(ForgeHTMLSanitizerFilter, self).__init__(*args, **kwargs)
+        # remove some elements from the sanitizer whitelist
+        # <form> and <input> could be used for a social engineering attack to construct a form
+        # others are just unexpected and confusing, and have no need to be used in markdown
+        ns_html = html5lib.constants.namespaces['html']
+        _form_elements = {(ns_html, 'button'),
+                          (ns_html, 'datalist'),
+                          (ns_html, 'fieldset'),
+                          (ns_html, 'form'),
+                          (ns_html, 'input'),
+                          (ns_html, 'label'),
+                          (ns_html, 'legend'),
+                          (ns_html, 'meter'),
+                          (ns_html, 'optgroup'),
+                          (ns_html, 'option'),
+                          (ns_html, 'output'),
+                          (ns_html, 'progress'),
+                          (ns_html, 'select'),
+                          (ns_html, 'textarea'),
+                          }
+        self.allowed_elements = set(html5lib.filters.sanitizer.allowed_elements) - _form_elements
+
+        # srcset is used in our own project_list/project_summary widgets which are used as macros so go through markdown
+        self.allowed_attributes = html5lib.filters.sanitizer.allowed_attributes | {'srcset'}
+
+        self.valid_iframe_srcs = ('https://www.youtube.com/embed/', 'https://www.gittip.com/')
+        self._prev_token_was_ok_iframe = False
 
     def sanitize_token(self, token):
-        if 'iframe' in self.allowed_elements:
-            self.allowed_elements.remove('iframe')
+        """
+        Allow iframe tags if the src attribute matches our list of valid sources.
+        Otherwise use default sanitization.
+        """
+
+        iframe_el = (html5lib.constants.namespaces['html'], 'iframe')
+        self.allowed_elements.discard(iframe_el)
         ok_opening_iframe = False
 
         if token.get('name') == 'iframe':
-            attrs = dict(token.get('data'))
-            if attrs.get('src', '').startswith(self.valid_iframe_srcs):
-                self.allowed_elements.append('iframe')
+            attrs = token.get('data') or {}
+            if attrs.get((None, 'src'), '').startswith(self.valid_iframe_srcs):
+                self.allowed_elements.add(iframe_el)
                 ok_opening_iframe = True
-            elif token.get('type') == tokenTypes["EndTag"] and self._prev_token_was_ok_iframe:
-                self.allowed_elements.append('iframe')
+            elif token.get('type') == "EndTag" and self._prev_token_was_ok_iframe:
+                self.allowed_elements.add(iframe_el)
 
         self._prev_token_was_ok_iframe = ok_opening_iframe
-        return super(ForgeHTMLSanitizer, self).sanitize_token(token)
+        return super(ForgeHTMLSanitizerFilter, self).sanitize_token(token)
 
 
 def ip_address(request):

http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/tests/test_markdown.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/test_markdown.py b/Allura/allura/tests/test_markdown.py
index 3ff53b4..f362359 100644
--- a/Allura/allura/tests/test_markdown.py
+++ b/Allura/allura/tests/test_markdown.py
@@ -136,10 +136,10 @@ Not *strong* or _underlined_."""
         expected_html = """\
 <div class="markdown_content"><p># Not A Heading #<br>
 ---<br>
-* <a href=/p/project/tool/artifact/>#100</a>, <a href=/p/project/tool/artifact/>r2</a><br>
-* <a href=/p/project/tool/artifact/>ticket:100</a><br>
-* <a href=/p/project/tool/artifact/#abc>comment:13:ticket:2</a><br>
-* <a href=/p/project/tool/2/tree/test.py#l3>source:test.py@2#L3</a></p>
+* <a href="/p/project/tool/artifact/">#100</a>, <a href="/p/project/tool/artifact/">r2</a><br>
+* <a href="/p/project/tool/artifact/">ticket:100</a><br>
+* <a href="/p/project/tool/artifact/#abc">comment:13:ticket:2</a><br>
+* <a href="/p/project/tool/2/tree/test.py#l3">source:test.py@2#L3</a></p>
 <p>Not *strong* or _underlined_.</div>"""
 
         md = ForgeMarkdown(

http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/tests/test_utils.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/test_utils.py b/Allura/allura/tests/test_utils.py
index 711bfe4..a52d295 100644
--- a/Allura/allura/tests/test_utils.py
+++ b/Allura/allura/tests/test_utils.py
@@ -36,6 +36,8 @@ from nose.tools import (
 from pygments import highlight
 from pygments.lexers import get_lexer_for_filename
 from tg import config
+import html5lib
+import html5lib.treewalkers
 
 from alluratest.controller import setup_unit_test
 
@@ -250,6 +252,12 @@ class TestCodeStats(unittest.TestCase):
 
 class TestHTMLSanitizer(unittest.TestCase):
 
+    def walker_from_text(self, text):
+        parsed = html5lib.parseFragment(text)
+        TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
+        walker = TreeWalker(parsed)
+        return walker
+
     def simple_tag_list(self, sanitizer):
         # no attrs, no close tag flag check, just real simple
         return [
@@ -257,17 +265,20 @@ class TestHTMLSanitizer(unittest.TestCase):
         ]
 
     def test_html_sanitizer_iframe(self):
-        p = utils.ForgeHTMLSanitizer('<div><iframe></iframe></div>')
+        walker = self.walker_from_text('<div><iframe></iframe></div>')
+        p = utils.ForgeHTMLSanitizerFilter(walker)
         assert_equal(self.simple_tag_list(p), ['div', 'div'])
 
     def test_html_sanitizer_youtube_iframe(self):
-        p = utils.ForgeHTMLSanitizer(
+        walker = self.walker_from_text(
             '<div><iframe src="https://www.youtube.com/embed/kOLpSPEA72U?feature=oembed"></iframe></div>')
+        p = utils.ForgeHTMLSanitizerFilter(walker)
         assert_equal(
             self.simple_tag_list(p), ['div', 'iframe', 'iframe', 'div'])
 
     def test_html_sanitizer_form_elements(self):
-        p = utils.ForgeHTMLSanitizer('<p>test</p><form method="post" action="http://localhost/foo.php"><input type=file><input type=text><textarea>asdf</textarea></form>')
+        walker = self.walker_from_text('<p>test</p><form method="post" action="http://localhost/foo.php"><input type=file><input type=text><textarea>asdf</textarea></form>')
+        p = utils.ForgeHTMLSanitizerFilter(walker)
         assert_equal(self.simple_tag_list(p), ['p', 'p'])
 
 

http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/requirements.txt
----------------------------------------------------------------------
diff --git a/requirements.txt b/requirements.txt
index 1400afc..bbe46aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 ActivityStream==0.2.2
 BeautifulSoup==3.2.0
-beautifulsoup4==4.4.0
+beautifulsoup4==4.6.1
 Beaker==1.6.4
 chardet==1.0.1
 colander==0.9.3
@@ -15,7 +15,7 @@ feedparser==5.1.3
 FormEncode==1.2.4
 # dep of Creoleparser
 Genshi==0.6
-html5lib==0.999
+html5lib==1.0.1
 # dep of oauth2
 httplib2==0.7.4
 iso8601==0.1.4
@@ -50,6 +50,8 @@ textile==2.1.5
 translationstring==0.4
 TimerMiddleware==0.4.4
 TurboGears2==2.1.5
+# dep of html5lib
+webencodings==0.5.1
 WebOb==1.0.8
 
 # dependencies for cryptography