This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
python-emails/emails/loader/helpers.py

46 lines
1.1 KiB
Python

# encoding: utf-8
from __future__ import unicode_literals
__all__ = ['guess_charset', 'fix_content_type']
import re
import cgi
import chardet
from emails.compat import to_unicode
import logging
# HTML page charset stuff
RE_CHARSET = re.compile(b"charset=\"?'?(.+)\"?'?", re.I + re.S + re.M)
RE_META = re.compile(b"<meta.*?http-equiv=\"?'?content-type\"?'?.*?>", re.I + re.S + re.M)
RE_INSIDE_META = re.compile(b"content=\"?'?([^\"'>]+)", re.I + re.S + re.M)
def fix_content_type(content_type, t='image'):
if not content_type:
return "%s/unknown" % t
else:
return content_type
def guess_charset(headers, html):
# guess by http headers
if headers:
content_type = headers['content-type']
if content_type:
_, params = cgi.parse_header(content_type)
r = params.get('charset', None)
if r:
return r
# guess by html meta
for s in RE_META.findall(html):
for x in RE_INSIDE_META.findall(s):
for charset in RE_CHARSET.findall(x):
return to_unicode(charset)
# guess by chardet
return chardet.detect(html)['encoding']