setup py
This commit is contained in:
parent
8774a508f4
commit
0e46f54108
|
@ -1,10 +1,13 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from html5lib import sanitizer, serializer, treebuilders, treewalkers
|
||||
import html5lib
|
||||
import re
|
||||
|
||||
DEFAULT_PARSER = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
|
||||
tree=treebuilders.getTreeBuilder("dom"))
|
||||
|
||||
# Case-insensitive pattern for an <img ...> tag; group 1 lazily captures the
# double-quoted value of its src attribute.
RE_IMG = re.compile(r'<img[^>]*\ssrc="(.*?)"', re.IGNORECASE)
|
||||
|
||||
def clean_html(data, full=True, parser=DEFAULT_PARSER):
|
||||
"""
|
||||
Cleans HTML from XSS vulnerabilities using html5lib
|
||||
|
@ -12,6 +15,7 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
|
|||
If full is False, only the contents inside <body> will be returned (without
|
||||
the <body> tags).
|
||||
"""
|
||||
data = extract_images(data)
|
||||
if full:
|
||||
dom_tree = parser.parse(data)
|
||||
else:
|
||||
|
@ -21,3 +25,9 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
|
|||
s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
|
||||
quote_attr_values=True)
|
||||
return u''.join(s.serialize(stream))
|
||||
|
||||
def extract_images(data):
    """
    Find the ``src`` value of every ``<img>`` tag in ``data``.

    The sources matched by ``RE_IMG`` are reported through the ``logging``
    module at debug level. ``data`` itself is returned unchanged, because
    ``clean_html`` assigns this function's result back to ``data`` and then
    hands it to the HTML parser, which needs the markup string rather than
    a regex match object.
    """
    import logging

    # Nothing to scan for empty or missing input; hand it back untouched.
    if not data:
        return data

    # findall scans the whole string. RE_IMG.match only tests position 0,
    # so an <img> preceded by any other markup was never detected.
    sources = RE_IMG.findall(data)
    if sources:
        logging.getLogger(__name__).debug("extract images: %r", sources)
    return data
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
transifex-client
|
||||
django-cms>=2.3
|
||||
html5lib
|
||||
|
|
Reference in New Issue