This commit is contained in:
Patrick Lauber 2012-12-10 16:58:44 +01:00
parent 8774a508f4
commit 0e46f54108
3 changed files with 12 additions and 1 deletions

View File

@ -1,10 +1,13 @@
# -*- coding: utf-8 -*-
from html5lib import sanitizer, serializer, treebuilders, treewalkers
import html5lib
import re
# Parser used as the default ``parser`` argument of clean_html(): it tokenizes
# input through html5lib's HTMLSanitizer and builds a "dom" tree.
DEFAULT_PARSER = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
tree=treebuilders.getTreeBuilder("dom"))
# Matches an <img ...> tag (case-insensitively) and captures the value of its
# double-quoted src attribute in group 1.
RE_IMG = re.compile(r'<img[^>]*\ssrc="(.*?)"', re.IGNORECASE)
def clean_html(data, full=True, parser=DEFAULT_PARSER):
"""
Cleans HTML from XSS vulnerabilities using html5lib
@ -12,6 +15,7 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
If full is False, only the contents inside <body> will be returned (without
the <body> tags).
"""
data = extract_images(data)
if full:
dom_tree = parser.parse(data)
else:
@ -21,3 +25,9 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
quote_attr_values=True)
return u''.join(s.serialize(stream))
def extract_images(data):
    """
    Scan ``data`` for <img> tags and hand ``data`` back unchanged.

    clean_html() rebinds its input to this function's return value before
    parsing it, so the markup itself must be returned; returning the regex
    match object (or None) would break the parsing step.

    The ``src`` values found by RE_IMG are only reported at DEBUG level; no
    image is removed from or rewritten in the markup here.
    """
    import logging  # function-scope import keeps the block self-contained

    if not data:
        # Nothing to scan (empty string or None); pass it through untouched.
        return data
    # findall() scans the whole string, whereas match() only tests position 0
    # and therefore misses any <img> that is not the very first thing in data.
    sources = RE_IMG.findall(data)
    if sources:
        logging.getLogger(__name__).debug(
            "extract images: %d <img> src value(s) found", len(sources))
    return data

View File

@ -1,2 +1,3 @@
transifex-client
django-cms>=2.3
html5lib

View File

@ -5,7 +5,7 @@ from djangocms_text_ckeditor import __version__
INSTALL_REQUIRES = [
'html5lib',
]
CLASSIFIERS = [