debian-pdfrw/pdfrw/objects/pdfstring.py

# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

import re

class PdfString(str):
    ''' A PdfString is an encoded string.  It has a decode
        method to get the actual string data out, and there
        is an encode class method to create such a string.
        Like any PDF object, it could be indirect, but it
        defaults to being a direct object.
    '''
    indirect = False
    unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
                     '\\r':'\r', '\\t':'\t',
                     '\\\r\n': '', '\\\r':'', '\\\n':'',
                     '\\\\':'\\', '\\':'',
                    }
    unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)'
    unescape_func = re.compile(unescape_pattern).split

    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func2 = re.compile(hex_pattern2).split

    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        assert self[0] == '(' and self[-1] == ')'
        mylist = self.unescape_func(self[1:-1])
        result = []
        unescape = self.unescape_dict.get
        for chunk in mylist:
            chunk = unescape(chunk, chunk)
            if chunk.startswith('\\') and len(chunk) > 1:
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        data = ''.join(self.split())
        data = self.hex_funcs[twobytes](data)
        chars = data[1::2]
        other = data[0::2]
        assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        if self.startswith('('):
            return self.decode_regular(remap)

        else:
            return self.decode_hex(remap, twobytes)

    def encode(cls, source, usehex=False):
        assert not usehex, "Not supported yet"
        if isinstance(source, unicode):
            source = source.encode('utf-8')
        else:
            source = str(source)
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' +source + ')')
    encode = classmethod(encode)