path: root/emails/loader/helpers.py
blob: c06d1189980f06f5c2812001b7b4cf3ff6474099
# encoding: utf-8
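"""Charset-guessing helpers for the loader.

Charsets are guessed from HTTP headers, from <meta> tags in the HTML
markup, by statistical detection (charade), and finally from a list of
common charsets.
"""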
from __future__ import unicode_literals
__all__ = ['guess_charset', 'fix_content_type']

import re
import cgi
import charade

from ..compat import to_native, to_unicode

# Detecting charset declarations inside HTML pages

class ReRules:
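    """Regular expressions for finding a charset declaration in <meta> tags.

    The patterns are defined as bytes; ``__init__`` compiles them, first
    passing each one through ``conv`` (e.g. ``to_unicode``), so the same
    rules can be built for either bytes or text input.
    """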
    re_meta = br"(?i)(?<=<meta).*?(?=>)"
    re_is_http_equiv = br"http-equiv=\"?'?content-type\"?'?"
    re_parse_http_equiv = br"content=\"?'?([^\"'>]+)"
    re_charset = br"charset=\"?'?([\w-]+)\"?'?"

    def __init__(self, conv=None):
        if conv is None:
            conv = lambda x: x
        for k in dir(self):
            if k.startswith('re_'):
                setattr(self, k, re.compile(conv(getattr(self, k)), re.I | re.S | re.M))

RULES_U = ReRules(conv=to_unicode)
RULES_B = ReRules()


def guess_text_charset(text, is_html=False):
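    """Guess the charset of ``text``.

    For HTML input, look for a charset declared in a <meta> tag (either an
    http-equiv="content-type" declaration or a bare charset attribute).
    If nothing is found and ``text`` is bytes, fall back to charade.
    Returns a native string, or None if the charset cannot be guessed.
    """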
    if is_html:
        rules = RULES_B if isinstance(text, bytes) else RULES_U
        for meta in rules.re_meta.findall(text):
            if rules.re_is_http_equiv.findall(meta):
                for content in rules.re_parse_http_equiv.findall(meta):
                    for charset in rules.re_charset.findall(content):
                        return to_native(charset)
            else:
                for charset in rules.re_charset.findall(meta):
                    return to_native(charset)
    # fall back to statistical detection via charade
    if isinstance(text, bytes):
        return to_native(charade.detect(text)['encoding'])


def guess_html_charset(html):
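    """Shortcut: guess the charset of an HTML document."""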
    return guess_text_charset(text=html, is_html=True)


def guess_charset(headers, html):
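    """Guess the charset of an HTML response.

    The Content-Type HTTP header wins; otherwise the markup itself is
    inspected via ``guess_html_charset``.
    """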

    # guess by http headers
    if headers:
        content_type = headers.get('content-type')
        if content_type:
            _, params = cgi.parse_header(content_type)
            r = params.get('charset', None)
            if r:
                return r

    # guess by html content
    charset = guess_html_charset(html)
    if charset:
        return to_unicode(charset)

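# Charsets tried as a last resort when no charset could be guessed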
COMMON_CHARSETS = ('ascii', 'utf-8', 'utf-16', 'windows-1251', 'windows-1252', 'cp850')

def decode_text(text,
                is_html=False,
                guess_charset=True,
                try_common_charsets=True,
                charsets=None,
                fallback_charset='utf-8'):
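    """Decode ``text`` to unicode, returning ``(decoded_text, charset)``.

    Candidate charsets are tried in order: the guessed charset, any
    explicitly passed ``charsets``, the common charsets above and finally
    ``fallback_charset``.  Text that is already unicode is returned as-is
    with ``None`` for the charset; if every candidate fails, the last
    decoding error is re-raised.
    """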

    if not isinstance(text, bytes):
        return text, None

    _charsets = []
    if guess_charset:
        c = guess_text_charset(text, is_html=is_html)
        if c:
            _charsets.append(c)

    if charsets:
        _charsets.extend(charsets)

    if try_common_charsets:
        _charsets.extend(COMMON_CHARSETS)

    if fallback_charset:
        _charsets.append(fallback_charset)

    _last_exc = None
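    # Try each candidate encoding in order, remembering the last failure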
    for enc in _charsets:
        try:
            return to_unicode(text, charset=enc), enc
        except (UnicodeDecodeError, LookupError) as exc:
            _last_exc = exc

    raise _last_exc