You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@allura.apache.org by br...@apache.org on 2018/07/31 19:14:23 UTC
[2/2] allura git commit: [#8221] upgrade html5lib
[#8221] upgrade html5lib
Project: http://git-wip-us.apache.org/repos/asf/allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/allura/commit/e67a58af
Tree: http://git-wip-us.apache.org/repos/asf/allura/tree/e67a58af
Diff: http://git-wip-us.apache.org/repos/asf/allura/diff/e67a58af
Branch: refs/heads/db/8221
Commit: e67a58af802a882e9979816316d19d46c378156d
Parents: 7a35db4
Author: Dave Brondsema <da...@brondsema.net>
Authored: Tue Jul 31 15:14:16 2018 -0400
Committer: Dave Brondsema <da...@brondsema.net>
Committed: Tue Jul 31 15:14:16 2018 -0400
----------------------------------------------------------------------
Allura/allura/lib/markdown_extensions.py | 21 +++++---
Allura/allura/lib/utils.py | 76 ++++++++++++++++-----------
Allura/allura/tests/test_markdown.py | 8 +--
Allura/allura/tests/test_utils.py | 17 ++++--
requirements.txt | 6 ++-
5 files changed, 79 insertions(+), 49 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/lib/markdown_extensions.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/markdown_extensions.py b/Allura/allura/lib/markdown_extensions.py
index c292d66..d87687b 100644
--- a/Allura/allura/lib/markdown_extensions.py
+++ b/Allura/allura/lib/markdown_extensions.py
@@ -29,7 +29,7 @@ import markdown
from . import macro
from . import helpers as h
from allura import model as M
-from allura.lib.utils import ForgeHTMLSanitizer
+from allura.lib.utils import ForgeHTMLSanitizerFilter
log = logging.getLogger(__name__)
@@ -472,13 +472,18 @@ class RelativeLinkRewriter(markdown.postprocessors.Postprocessor):
class HTMLSanitizer(markdown.postprocessors.Postprocessor):
def run(self, text):
- parser = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer)
- parsed = parser.parse(text)
- serializer = html5lib.serializer.HTMLSerializer()
- walker = html5lib.getTreeWalker("etree")
- stream = html5lib.filters.alphabeticalattributes.Filter(walker(parsed))
- out = ''.join(serializer.serialize(stream))
- return out
+ parsed = html5lib.parseFragment(text)
+
+ # if we didn't have to customize our sanitization, could just do:
+ # return html5lib.serialize(parsed, sanitize=True)
+
+ # instead we do the same steps as that function,
+ # but add our ForgeHTMLSanitizerFilter instead of sanitize=True which would use the standard one
+ TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
+ walker = TreeWalker(parsed)
+ walker = ForgeHTMLSanitizerFilter(walker) # this is our custom step
+ s = html5lib.serializer.HTMLSerializer()
+ return s.render(walker)
class AutolinkPattern(markdown.inlinepatterns.Pattern):
http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/lib/utils.py
----------------------------------------------------------------------
diff --git a/Allura/allura/lib/utils.py b/Allura/allura/lib/utils.py
index 1ec7f6a..f40d0a4 100644
--- a/Allura/allura/lib/utils.py
+++ b/Allura/allura/lib/utils.py
@@ -15,17 +15,12 @@
# specific language governing permissions and limitations
# under the License.
from contextlib import contextmanager
-
import time
import string
import hashlib
import binascii
import logging.handlers
import codecs
-
-from html5lib.constants import tokenTypes
-
-from ming.odm import session
import os.path
import datetime
import random
@@ -47,11 +42,11 @@ from webhelpers.html import literal
from webob import exc
from pygments.formatters import HtmlFormatter
from setproctitle import getproctitle
-import html5lib.sanitizer
-
+import html5lib.filters.sanitizer
from ew import jinja2_ew as ew
from ming.utils import LazyProperty
from ming.odm.odmsession import ODMCursor
+from ming.odm import session
MARKDOWN_EXTENSIONS = ['.markdown', '.mdown', '.mkdn', '.mkd', '.md']
@@ -570,40 +565,57 @@ def serve_file(fp, filename, content_type, last_modified=None,
return iter(lambda: fp.read(block_size), '')
-class ForgeHTMLSanitizer(html5lib.sanitizer.HTMLSanitizer):
- # remove some elements from the sanitizer whitelist
- # <form> and <input> could be used for a social engineering attack to construct a form
- # others are just unexpected and confusing, and have no need to be used in markdown
- _form_elements = ('button', 'datalist', 'fieldset', 'form', 'input', 'label', 'legend', 'meter', 'optgroup',
- 'option', 'output', 'progress', 'select', 'textarea')
- _forge_acceptable_elements = [e for e in html5lib.sanitizer.HTMLSanitizer.acceptable_elements
- if e not in (_form_elements)]
- allowed_elements = _forge_acceptable_elements \
- + html5lib.sanitizer.HTMLSanitizer.mathml_elements \
- + html5lib.sanitizer.HTMLSanitizer.svg_elements
+class ForgeHTMLSanitizerFilter(html5lib.filters.sanitizer.Filter):
- # srcset is used in our own project_list/project_summary widgets which are used as macros so go through markdown
- allowed_attributes = html5lib.sanitizer.HTMLSanitizer.allowed_attributes + ['srcset']
-
- valid_iframe_srcs = ('https://www.youtube.com/embed/', 'https://www.gittip.com/')
-
- _prev_token_was_ok_iframe = False
+ def __init__(self, *args, **kwargs):
+ super(ForgeHTMLSanitizerFilter, self).__init__(*args, **kwargs)
+ # remove some elements from the sanitizer whitelist
+ # <form> and <input> could be used for a social engineering attack to construct a form
+ # others are just unexpected and confusing, and have no need to be used in markdown
+ ns_html = html5lib.constants.namespaces['html']
+ _form_elements = {(ns_html, 'button'),
+ (ns_html, 'datalist'),
+ (ns_html, 'fieldset'),
+ (ns_html, 'form'),
+ (ns_html, 'input'),
+ (ns_html, 'label'),
+ (ns_html, 'legend'),
+ (ns_html, 'meter'),
+ (ns_html, 'optgroup'),
+ (ns_html, 'option'),
+ (ns_html, 'output'),
+ (ns_html, 'progress'),
+ (ns_html, 'select'),
+ (ns_html, 'textarea'),
+ }
+ self.allowed_elements = set(html5lib.filters.sanitizer.allowed_elements) - _form_elements
+
+ # srcset is used in our own project_list/project_summary widgets which are used as macros so go through markdown
+ self.allowed_attributes = html5lib.filters.sanitizer.allowed_attributes | {'srcset'}
+
+ self.valid_iframe_srcs = ('https://www.youtube.com/embed/', 'https://www.gittip.com/')
+ self._prev_token_was_ok_iframe = False
def sanitize_token(self, token):
- if 'iframe' in self.allowed_elements:
- self.allowed_elements.remove('iframe')
+ """
+ Allow iframe tags if the src attribute matches our list of valid sources.
+ Otherwise use default sanitization.
+ """
+
+ iframe_el = (html5lib.constants.namespaces['html'], 'iframe')
+ self.allowed_elements.discard(iframe_el)
ok_opening_iframe = False
if token.get('name') == 'iframe':
- attrs = dict(token.get('data'))
- if attrs.get('src', '').startswith(self.valid_iframe_srcs):
- self.allowed_elements.append('iframe')
+ attrs = token.get('data') or {}
+ if attrs.get((None, 'src'), '').startswith(self.valid_iframe_srcs):
+ self.allowed_elements.add(iframe_el)
ok_opening_iframe = True
- elif token.get('type') == tokenTypes["EndTag"] and self._prev_token_was_ok_iframe:
- self.allowed_elements.append('iframe')
+ elif token.get('type') == "EndTag" and self._prev_token_was_ok_iframe:
+ self.allowed_elements.add(iframe_el)
self._prev_token_was_ok_iframe = ok_opening_iframe
- return super(ForgeHTMLSanitizer, self).sanitize_token(token)
+ return super(ForgeHTMLSanitizerFilter, self).sanitize_token(token)
def ip_address(request):
http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/tests/test_markdown.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/test_markdown.py b/Allura/allura/tests/test_markdown.py
index 3ff53b4..f362359 100644
--- a/Allura/allura/tests/test_markdown.py
+++ b/Allura/allura/tests/test_markdown.py
@@ -136,10 +136,10 @@ Not *strong* or _underlined_."""
expected_html = """\
<div class="markdown_content"><p># Not A Heading #<br>
---<br>
-* <a href=/p/project/tool/artifact/>#100</a>, <a href=/p/project/tool/artifact/>r2</a><br>
-* <a href=/p/project/tool/artifact/>ticket:100</a><br>
-* <a href=/p/project/tool/artifact/#abc>comment:13:ticket:2</a><br>
-* <a href=/p/project/tool/2/tree/test.py#l3>source:test.py@2#L3</a></p>
+* <a href="/p/project/tool/artifact/">#100</a>, <a href="/p/project/tool/artifact/">r2</a><br>
+* <a href="/p/project/tool/artifact/">ticket:100</a><br>
+* <a href="/p/project/tool/artifact/#abc">comment:13:ticket:2</a><br>
+* <a href="/p/project/tool/2/tree/test.py#l3">source:test.py@2#L3</a></p>
<p>Not *strong* or _underlined_.</div>"""
md = ForgeMarkdown(
http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/Allura/allura/tests/test_utils.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/test_utils.py b/Allura/allura/tests/test_utils.py
index 711bfe4..a52d295 100644
--- a/Allura/allura/tests/test_utils.py
+++ b/Allura/allura/tests/test_utils.py
@@ -36,6 +36,8 @@ from nose.tools import (
from pygments import highlight
from pygments.lexers import get_lexer_for_filename
from tg import config
+import html5lib
+import html5lib.treewalkers
from alluratest.controller import setup_unit_test
@@ -250,6 +252,12 @@ class TestCodeStats(unittest.TestCase):
class TestHTMLSanitizer(unittest.TestCase):
+ def walker_from_text(self, text):
+ parsed = html5lib.parseFragment(text)
+ TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
+ walker = TreeWalker(parsed)
+ return walker
+
def simple_tag_list(self, sanitizer):
# no attrs, no close tag flag check, just real simple
return [
@@ -257,17 +265,20 @@ class TestHTMLSanitizer(unittest.TestCase):
]
def test_html_sanitizer_iframe(self):
- p = utils.ForgeHTMLSanitizer('<div><iframe></iframe></div>')
+ walker = self.walker_from_text('<div><iframe></iframe></div>')
+ p = utils.ForgeHTMLSanitizerFilter(walker)
assert_equal(self.simple_tag_list(p), ['div', 'div'])
def test_html_sanitizer_youtube_iframe(self):
- p = utils.ForgeHTMLSanitizer(
+ walker = self.walker_from_text(
'<div><iframe src="https://www.youtube.com/embed/kOLpSPEA72U?feature=oembed"></iframe></div>')
+ p = utils.ForgeHTMLSanitizerFilter(walker)
assert_equal(
self.simple_tag_list(p), ['div', 'iframe', 'iframe', 'div'])
def test_html_sanitizer_form_elements(self):
- p = utils.ForgeHTMLSanitizer('<p>test</p><form method="post" action="http://localhost/foo.php"><input type=file><input type=text><textarea>asdf</textarea></form>')
+ walker = self.walker_from_text('<p>test</p><form method="post" action="http://localhost/foo.php"><input type=file><input type=text><textarea>asdf</textarea></form>')
+ p = utils.ForgeHTMLSanitizerFilter(walker)
assert_equal(self.simple_tag_list(p), ['p', 'p'])
http://git-wip-us.apache.org/repos/asf/allura/blob/e67a58af/requirements.txt
----------------------------------------------------------------------
diff --git a/requirements.txt b/requirements.txt
index 1400afc..bbe46aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
ActivityStream==0.2.2
BeautifulSoup==3.2.0
-beautifulsoup4==4.4.0
+beautifulsoup4==4.6.1
Beaker==1.6.4
chardet==1.0.1
colander==0.9.3
@@ -15,7 +15,7 @@ feedparser==5.1.3
FormEncode==1.2.4
# dep of Creoleparser
Genshi==0.6
-html5lib==0.999
+html5lib==1.0.1
# dep of oauth2
httplib2==0.7.4
iso8601==0.1.4
@@ -50,6 +50,8 @@ textile==2.1.5
translationstring==0.4
TimerMiddleware==0.4.4
TurboGears2==2.1.5
+# dep of html5lib
+webencodings==0.5.1
WebOb==1.0.8
# dependencies for cryptography