# debian-python-tinycss2/tinycss2/bytes.py

from webencodings import UTF8, decode, lookup

from .parser import parse_stylesheet


def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
                            environment_encoding=None):
    """Determine the character encoding of a CSS stylesheet and decode it.

    This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
    an ``@charset`` rule,
    and encoding meta-information.

    The fallback encoding handed to :func:`webencodings.decode`
    is the first of the following that is available:

    1. ``protocol_encoding``, when it is a label known to
       :func:`webencodings.lookup`;
    2. the label of a leading ``@charset "…";`` rule, when it is known
       (a UTF-16 label is replaced by UTF-8);
    3. ``environment_encoding``;
    4. UTF-8.

    Unknown labels are ignored and the next candidate is tried.

    :param css_bytes: A byte string.
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :param environment_encoding:
        A :class:`webencodings.Encoding` object
        for the `environment encoding
        <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
        if any.
    :returns:
        A 2-tuple of a decoded Unicode string
        and the :class:`webencodings.Encoding` object that was used.

    """
    # http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream
    charset_prefix = b'@charset "'
    label_start = len(charset_prefix)
    # Arbitrary upper bound on the position of the closing quote,
    # so that no encoding label is longer than 100 - len(prefix) bytes.
    search_limit = 100

    # 1. Encoding announced by the transport protocol.
    protocol = lookup(protocol_encoding) if protocol_encoding else None
    if protocol:
        return decode(css_bytes, protocol)

    # 2. Encoding declared by an ``@charset "label";`` rule
    #    at the very start of the byte stream.
    if css_bytes.startswith(charset_prefix):
        label_end = css_bytes.find(b'"', label_start, search_limit)
        # The closing quote must be immediately followed by a semicolon.
        if label_end != -1 and css_bytes.startswith(b'";', label_end):
            label = css_bytes[label_start:label_end].decode('latin1')
            declared = lookup(label)
            if declared:
                # The rule was just matched as single-byte ASCII text,
                # so a UTF-16 label is not used; UTF-8 is used instead.
                is_utf16 = declared.name in ('utf-16be', 'utf-16le')
                return decode(css_bytes, UTF8 if is_utf16 else declared)

    # 3. Environment encoding if provided, otherwise 4. UTF-8.
    return decode(css_bytes, environment_encoding or UTF8)


def parse_stylesheet_bytes(css_bytes, protocol_encoding=None,
                           environment_encoding=None,
                           skip_comments=False, skip_whitespace=False):
    """Parse :diagram:`stylesheet` from bytes,
    determining the character encoding as web browsers do.

    This is used when reading a file or fetching a URL.
    The character encoding is determined from the initial bytes
    (a :abbr:`BOM (Byte Order Mark)` or an ``@charset`` rule)
    as well as the parameters.
    The ultimate fallback is UTF-8.

    The bytes are first decoded with :func:`decode_stylesheet_bytes`,
    then the resulting Unicode string is handed to ``parse_stylesheet``.

    :param css_bytes: A byte string.
    :param protocol_encoding:
        A string.
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :param environment_encoding:
        A :class:`webencodings.Encoding` object
        for the `environment encoding`_,
        if any.
    :param skip_comments:
        Ignore CSS comments at the top-level of the stylesheet.
        If the input is a string, ignore all comments.
    :param skip_whitespace:
        Ignore whitespace at the top-level of the stylesheet.
        Whitespace is still preserved
        in the :attr:`~tinycss2.ast.QualifiedRule.prelude`
        and the :attr:`~tinycss2.ast.QualifiedRule.content` of rules.
    :returns:
        A ``(rules, encoding)`` tuple.

        * :obj:`rules` is a list of
          :class:`~tinycss2.ast.QualifiedRule`,
          :class:`~tinycss2.ast.AtRule`,
          :class:`~tinycss2.ast.Comment` (if ``skip_comments`` is false),
          :class:`~tinycss2.ast.WhitespaceToken`
          (if ``skip_whitespace`` is false),
          and :class:`~tinycss2.ast.ParseError` objects.
        * :obj:`encoding` is the :class:`webencodings.Encoding` object
          that was used.
          If ``rules`` contains an ``@import`` rule, this is
          the `environment encoding`_ for the imported stylesheet.

    .. _environment encoding:
            http://www.w3.org/TR/css-syntax/#environment-encoding

    .. code-block:: python

        response = urlopen('http://example.net/foo.css')
        rules, encoding = parse_stylesheet_bytes(
            css_bytes=response.read(),
            # Python 3.x
            protocol_encoding=response.info().get_param('charset'),
            # On Python 2.x, use instead:
            # protocol_encoding=response.info().getparam('charset'),
        )
        for rule in rules:
            ...

    """
    # Step 1: bytes -> text, remembering which encoding was actually used.
    decoded_text, used_encoding = decode_stylesheet_bytes(
        css_bytes,
        protocol_encoding=protocol_encoding,
        environment_encoding=environment_encoding,
    )
    # Step 2: text -> list of top-level rules.
    rules = parse_stylesheet(decoded_text, skip_comments, skip_whitespace)
    return rules, used_encoding