debian-asn1crypto/asn1crypto/_iri.py

# coding: utf-8

"""
Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
the following items:

 - iri_to_uri()
 - uri_to_iri()
"""

from __future__ import unicode_literals, division, absolute_import, print_function

from encodings import idna  # noqa
import codecs
import re
import sys

from ._errors import unwrap
from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types

if sys.version_info < (3,):
    from urlparse import urlsplit, urlunsplit
    from urllib import (
        quote as urlquote,
        unquote as unquote_to_bytes,
    )

else:
    from urllib.parse import (
        quote as urlquote,
        unquote_to_bytes,
        urlsplit,
        urlunsplit,
    )


def iri_to_uri(value):
    """
    Normalizes and encodes a unicode IRI into an ASCII byte string URI

    The hostname is encoded using IDNA, while the scheme, userinfo, path,
    query and fragment are percent-encoded via _urlquote(). The port is
    omitted when it is the default for http (80) or https (443), and a path
    of just "/" is omitted when there is no query or fragment.

    :param value:
        A unicode string of an IRI

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            real_prefix = prefix_match.group(0)
            # Temporarily swap in a scheme that urlsplit() handles correctly
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            # The real scheme is the matched prefix without the trailing "://"
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)
    hostname = parsed.hostname
    if hostname is not None:
        hostname = hostname.encode('idna')
        # The hostname attribute has the square brackets of an IP literal
        # stripped. A colon can only appear in such a literal, so the brackets
        # are restored to keep the host distinguishable from the port.
        if b':' in hostname:
            hostname = b'[' + hostname + b']'
    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        # The default port of a scheme is redundant, so it is dropped
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        if not default_http and not default_https:
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    # A bare "/" path with nothing after it is equivalent to an empty path
    if query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output


def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    The hostname is decoded using IDNA and the remaining components are
    unquoted via _urlunquote(). Characters that would change the structure
    of a component stay percent-encoded: ":" and "@" in the userinfo, "/" in
    the path, and "&" and "=" in the query.

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])
    hostname = parsed.hostname
    if hostname:
        hostname = hostname.decode('idna')
        # The hostname attribute has the square brackets of an IP literal
        # stripped. A colon can only appear in such a literal, so the brackets
        # are restored to keep the host distinguishable from the port.
        if ':' in hostname:
            hostname = '[' + hostname + ']'
    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal delimiters are preserved, while encoded ones stay encoded
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))


def _iri_utf8_errors_handler(exc):
    """
    Codec error callback used when URI bytes can not be decoded as UTF-8
    while building an IRI. The offending bytes are emitted as lowercase %xx
    escapes inside the resulting unicode string instead of raising.

    :param exc:
        The UnicodeDecodeError exception

    :return:
        A 2-element tuple of (replacement unicode string, integer index to
        resume at)
    """

    undecodable = exc.object[exc.start:exc.end]

    escaped = ''
    for byte_value in bytes_to_list(undecodable):
        escaped += '%%%02x' % byte_value

    # Decoding continues right after the bytes that were just escaped
    return (escaped, exc.end)


# Register the handler under the name 'iriutf8' so that it can be passed as
# the errors argument of the .decode('utf-8', 'iriutf8') calls in this module
codecs.register_error('iriutf8', _iri_utf8_errors_handler)


def _urlquote(string, safe=''):
    """
    Quotes a unicode string for use in a URL

    Existing %XX escapes are decoded when they form valid UTF-8, and the
    decoded text is then quoted like the rest of the string. Escapes that are
    not valid UTF-8, or that stand for "%" or one of the safe characters, are
    kept percent-encoded since the encoded and literal forms of those are
    functionally different.

    :param string:
        A unicode string

    :param safe:
        A unicode string of characters to not encode

    :return:
        None (if string is None or empty) or an ASCII byte string of the
        quoted string
    """

    if string is None or string == '':
        return None

    safe_bytes = safe.encode('utf-8')

    def _quote(text):
        # Percent-encodes literal text, always returning a byte string
        if not text:
            return b''
        quoted = urlquote(text.encode('utf-8'), safe=safe_bytes)
        if not isinstance(quoted, byte_cls):
            quoted = quoted.encode('ascii')
        return quoted

    def _quote_keeping_escapes(text):
        # Quotes text, but passes every %XX sequence through untouched so
        # that it is not double quoted. Splitting on the escapes avoids the
        # need for a placeholder character that could collide with real data.
        quoted = []
        for index, piece in enumerate(re.split('(%[0-9a-fA-F]{2})', text)):
            if index % 2:
                quoted.append(piece.encode('ascii'))
            else:
                quoted.append(_quote(piece))
        return b''.join(quoted)

    def _requote_run(run):
        # Reduces a run of %XX escapes to the minimal set that must remain
        # encoded, then quotes whatever was decoded
        byte_string = unquote_to_bytes(run)
        # A decoded "%" has to stay encoded, otherwise it could combine with
        # any hex digits that follow it and form a different escape
        byte_string = byte_string.replace(b'%', b'%25')
        # Bytes that are not valid UTF-8 are restored to their %xx form
        unicode_string = byte_string.decode('utf-8', 'iriutf8')
        # Requote any safe chars since encoded versions of those are
        # functionally different than the unquoted ones
        for safe_char in safe:
            unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
        return _quote_keeping_escapes(unicode_string)

    # Odd-numbered elements are the runs of escapes captured by the pattern,
    # even-numbered elements are the literal text in between
    output = []
    for index, part in enumerate(re.split('((?:%[0-9a-fA-F]{2})+)', string)):
        if index % 2:
            output.append(_requote_run(part))
        else:
            output.append(_quote(part))

    return b''.join(output)


def _urlunquote(byte_string, remap=None, preserve=None):
    """
    Unquotes a URI portion from a byte string into unicode using UTF-8

    A percent-encoded "%" is always left as %25 so that it can not combine
    with following hex digits to form a new escape. Bytes that are not valid
    UTF-8 are left in %xx form by the "iriutf8" error handler.

    :param byte_string:
        A byte string of the data to unquote

    :param remap:
        A list of characters (as unicode) that should be re-mapped to a
        %XX encoding. This is used when characters are not valid in part of a
        URL.

    :param preserve:
        A bool - indicates that the chars to be remapped if they occur in
        non-hex form, should be preserved. E.g. / for URL path.

    :return:
        None (if byte_string is None), otherwise a unicode string
    """

    if byte_string is None:
        return byte_string

    if byte_string == b'':
        return ''

    # Pairs of (literal byte, percent-encoded replacement)
    remap_pairs = []
    for char in (remap or []):
        remap_pairs.append((char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii')))

    def _unquote_piece(piece):
        # Keep "%" encoded before the remap step introduces escapes of its own
        piece = unquote_to_bytes(piece).replace(b'%', b'%25')
        for literal, escaped in remap_pairs:
            piece = piece.replace(literal, escaped)
        return piece.decode('utf-8', 'iriutf8')

    if not preserve or not remap_pairs:
        return _unquote_piece(byte_string)

    # Split on the literal chars that must be preserved, so that only the
    # text between them is unquoted and remapped. This avoids swapping in
    # placeholder characters that could collide with real data.
    literal_re = b'(' + b'|'.join([re.escape(literal) for literal, _ in remap_pairs]) + b')'

    output = []
    for index, piece in enumerate(re.split(literal_re, byte_string)):
        if index % 2:
            # Odd-numbered elements are the preserved literal chars
            output.append(piece.decode('ascii'))
        else:
            output.append(_unquote_piece(piece))

    return ''.join(output)