From 0e46f54108e04a535de5a951d96216dc1aece5ab Mon Sep 17 00:00:00 2001
From: Patrick Lauber
Date: Mon, 10 Dec 2012 16:58:44 +0100
Subject: [PATCH] setup py

---
Review note: extract_images was reworked relative to the original commit.
It now hands the markup back unchanged (clean_html passes its result to the
parser, so it must stay a string), searches the whole input instead of only
its start, and logs at debug level instead of printing. The index line of
html.py still carries the blob ids of the original commit.

 djangocms_text_ckeditor/html.py | 17 +++++++++++++++++
 requirements.txt                |  1 +
 setup.py                        |  2 +-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/djangocms_text_ckeditor/html.py b/djangocms_text_ckeditor/html.py
index 1af3abf..9cb04b1 100644
--- a/djangocms_text_ckeditor/html.py
+++ b/djangocms_text_ckeditor/html.py
@@ -1,10 +1,14 @@
 # -*- coding: utf-8 -*-
 from html5lib import sanitizer, serializer, treebuilders, treewalkers
 import html5lib
+import logging
+import re
 
 DEFAULT_PARSER = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                      tree=treebuilders.getTreeBuilder("dom"))
 
+RE_IMG = re.compile(r'<img[^>]*\ssrc="(.*?)"', re.IGNORECASE)
+
 def clean_html(data, full=True, parser=DEFAULT_PARSER):
     """
     Cleans HTML from XSS vulnerabilities using html5lib
@@ -12,6 +16,7 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
     If full is False, only the contents inside <body> will be returned (without
     the <body> tags).
     """
+    data = extract_images(data)
     if full:
         dom_tree = parser.parse(data)
     else:
@@ -21,3 +26,15 @@ def clean_html(data, full=True, parser=DEFAULT_PARSER):
     s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                  quote_attr_values=True)
     return u''.join(s.serialize(stream))
+
+def extract_images(data):
+    """
+    Pre-processing hook that clean_html runs on the raw markup.
+
+    Looks up the src value of every <img> tag in data and logs the result
+    at debug level. The markup itself is not modified, so data is always
+    returned unchanged and can be handed to the parser as before.
+    """
+    sources = RE_IMG.findall(data)
+    logging.getLogger(__name__).debug("extract images: %r", sources)
+    return data
diff --git a/requirements.txt b/requirements.txt
index 275b915..8d0670d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 transifex-client
 django-cms>=2.3
+html5lib
diff --git a/setup.py b/setup.py
index 347322b..10b4f95 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ from djangocms_text_ckeditor import __version__
 
 
 INSTALL_REQUIRES = [
-
+    'html5lib',
 ]
 
 CLASSIFIERS = [