debian-python-tinycss2/tinycss2/bytes.py

113 lines
4.4 KiB
Python

from webencodings import UTF8, decode, lookup
from .parser import parse_stylesheet
def decode_stylesheet_bytes(css_bytes, protocol_encoding=None,
                            environment_encoding=None):
    """Pick the character encoding of a CSS stylesheet and decode it.

    Candidate encodings are considered in this order; the first usable
    one wins:

    1. ``protocol_encoding``, when it is a label that
       :func:`webencodings.lookup` recognises;
    2. the label of an ``@charset "<label>";`` rule found at the very
       start of ``css_bytes`` (a UTF-16 label is replaced by UTF-8);
    3. ``environment_encoding``;
    4. UTF-8.

    The selected encoding is handed to :func:`webencodings.decode` as its
    fallback encoding; detection of a :abbr:`BOM (Byte Order Mark)`
    is left to that function.

    :param css_bytes: A byte string.
    :param protocol_encoding:
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :param environment_encoding:
        A :class:`webencodings.Encoding` object
        for the `environment encoding
        <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
        if any.
    :returns:
        A 2-tuple of a decoded Unicode string
        and the :class:`webencodings.Encoding` object that was used.

    """
    # http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream

    # 1. Protocol-level label: an unknown label makes lookup() return a
    #    false value, in which case we fall through to the next source.
    from_protocol = lookup(protocol_encoding) if protocol_encoding else None
    if from_protocol:
        return decode(css_bytes, from_protocol)

    # 2. A leading ``@charset "<label>";`` rule.
    charset_prefix = b'@charset "'
    if css_bytes.startswith(charset_prefix):
        label_start = len(charset_prefix)  # 10
        # Only look for the closing quote within the first 100 bytes:
        # an arbitrary bound, so no label is longer than 100-10 bytes.
        label_end = css_bytes.find(b'"', label_start, 100)
        # The closing quote must be immediately followed by a semicolon.
        if label_end != -1 and css_bytes.startswith(b'";', label_end):
            label = css_bytes[label_start:label_end].decode('latin1')
            from_charset = lookup(label)
            if from_charset:
                # The rule was just matched as single-byte ASCII, so a
                # UTF-16 label cannot be right: use UTF-8 in its place.
                if from_charset.name in ('utf-16be', 'utf-16le'):
                    from_charset = UTF8
                return decode(css_bytes, from_charset)

    # 3. / 4. Environment encoding when provided, UTF-8 as the last resort.
    return decode(css_bytes, environment_encoding or UTF8)
def parse_stylesheet_bytes(css_bytes, protocol_encoding=None,
                           environment_encoding=None,
                           skip_comments=False, skip_whitespace=False):
    """Parse :diagram:`stylesheet` from bytes,
    determining the character encoding as web browsers do.

    This is the entry point to use when reading a file or fetching an URL.
    The bytes are first decoded with :func:`decode_stylesheet_bytes`:
    the encoding is chosen from the initial bytes
    (a :abbr:`BOM (Byte Order Mark)` or an ``@charset`` rule)
    together with the two encoding parameters,
    with UTF-8 as the ultimate fallback.
    The resulting text is then given to ``parse_stylesheet``.

    :param css_bytes: A byte string.
    :param protocol_encoding:
        A string.
        The encoding label, if any, defined by HTTP or equivalent protocol.
        (e.g. via the ``charset`` parameter of the ``Content-Type`` header.)
    :param environment_encoding:
        A :class:`webencodings.Encoding` object
        for the `environment encoding`_,
        if any.
    :param skip_comments:
        Ignore CSS comments at the top-level of the stylesheet.
        Forwarded unchanged to ``parse_stylesheet``.
    :param skip_whitespace:
        Ignore whitespace at the top-level of the stylesheet.
        Whitespace is still preserved
        in the :attr:`~tinycss2.ast.QualifiedRule.prelude`
        and the :attr:`~tinycss2.ast.QualifiedRule.content` of rules.
        Forwarded unchanged to ``parse_stylesheet``.
    :returns:
        A ``(rules, encoding)`` tuple.

        * :obj:`rules` is a list of
          :class:`~tinycss2.ast.QualifiedRule`,
          :class:`~tinycss2.ast.AtRule`,
          :class:`~tinycss2.ast.Comment` (if ``skip_comments`` is false),
          :class:`~tinycss2.ast.WhitespaceToken`
          (if ``skip_whitespace`` is false),
          and :class:`~tinycss2.ast.ParseError` objects.
        * :obj:`encoding` is the :class:`webencodings.Encoding` object
          that was used.
          If ``rules`` contains an ``@import`` rule, this is
          the `environment encoding`_ for the imported stylesheet.

    .. _environment encoding:
            http://www.w3.org/TR/css-syntax/#environment-encoding

    .. code-block:: python

        response = urlopen('http://example.net/foo.css')
        rules, encoding = parse_stylesheet_bytes(
            css_bytes=response.read(),
            # Python 3.x; on Python 2.x use
            # response.info().getparam('charset') instead.
            protocol_encoding=response.info().get_content_charset(),
        )
        for rule in rules:
            ...

    """
    # Stage 1: bytes -> text, remembering which encoding was applied.
    text, used_encoding = decode_stylesheet_bytes(
        css_bytes,
        protocol_encoding=protocol_encoding,
        environment_encoding=environment_encoding,
    )
    # Stage 2: text -> top-level rule list.
    rules = parse_stylesheet(text, skip_comments, skip_whitespace)
    return rules, used_encoding