debian-pdfrw/pdfrw/objects/pdfstring.py

74 lines
2.6 KiB
Python

# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
import re
class PdfString(str):
    ''' A PdfString is an encoded string.  It has a decode
        method to get the actual string data out, and there
        is an encode class method to create such a string.
        Like any PDF object, it could be indirect, but it
        defaults to being a direct object.

        Two encoded forms are understood by decode():
        literal strings delimited by parentheses, such as
        (Hello), and hexadecimal strings delimited by angle
        brackets, such as <48656C6C6F>.
    '''

    indirect = False

    # Replacement text for each escape sequence found in a literal string.
    # A backslash followed by an end-of-line is a line continuation and
    # vanishes; a backslash before any other character is simply dropped
    # (which is how "\(" and "\)" turn into bare parentheses).
    unescape_dict = {
        '\\b': '\b', '\\f': '\f', '\\n': '\n',
        '\\r': '\r', '\\t': '\t',
        '\\\r\n': '', '\\\r': '', '\\\n': '',
        '\\\\': '\\', '\\': '',
    }

    # Order matters: the escaped backslash must be tried first and the lone
    # backslash last.  Octal escapes are limited to one to three octal
    # digits, so "\0053" is "\005" followed by "3" and "\9" is not octal.
    unescape_pattern = (r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|'
                        r'\\[0-7]{1,3}|\\)')
    unescape_func = re.compile(unescape_pattern).split

    # Splitters for hexadecimal strings: one byte (two digits) per
    # character, or two bytes (four digits) per character.
    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    hex_pattern2 = ('([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|'
                    '[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])')
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the "twobytes" flag of decode_hex().
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal string of the form (...).

            remap is called with the integer value of each octal
            escape and must return the text to substitute for it.

            Returns the decoded text.  Raises AssertionError if the
            string is not wrapped in parentheses.
        '''
        assert self[0] == '(' and self[-1] == ')'
        unescape = self.unescape_dict.get
        result = []
        # Splitting on a capturing group yields plain text and escape
        # sequences alternately; plain text never contains a backslash.
        for chunk in self.unescape_func(self[1:-1]):
            chunk = unescape(chunk, chunk)
            if chunk.startswith('\\') and len(chunk) > 1:
                # Anything still starting with a backslash is an octal escape.
                value = int(chunk[1:], 8)
                # FIXME: TODO: Handle unicode here
                # Values above 127 are currently clamped to 127.
                if value > 127:
                    value = 127
                chunk = remap(value)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hexadecimal string of the form <...>.

            Whitespace between digits is ignored.  If the number of
            digits is odd, the missing final digit is taken to be 0,
            so <901FA> is the same as <901FA0>.  An empty hexadecimal
            string <> decodes to an empty string.

            remap is called with the integer value of each group of
            digits and must return the text for it.  When twobytes is
            true, digits are grouped four at a time instead of two.

            Returns the decoded text.  Raises AssertionError if the
            string contains anything other than the angle brackets,
            whitespace and hexadecimal digits.
        '''
        data = ''.join(self.split())
        # Well-formed data is '<' + digits + '>', so an odd total length
        # means an odd number of digits: pad with the implied trailing 0.
        if data[:1] == '<' and data[-1:] == '>' and len(data) % 2:
            data = data[:-1] + '0>'
        data = self.hex_funcs[twobytes](data)
        chars = data[1::2]
        other = data[0::2]
        # The text left over around the digit groups must be exactly the
        # two delimiters, with '<' first and '>' last.  With no digits at
        # all the split returns the whole string as a single leftover.
        well_formed = ''.join(other) == '<>' and (
            not chars or (other[0] == '<' and other[-1] == '>'))
        assert well_formed, self
        return ''.join([remap(int(x, 16)) for x in chars])

    def decode(self, remap=chr, twobytes=False):
        ''' Decode this string, choosing the literal or the
            hexadecimal decoder from the first character.

            See decode_regular() and decode_hex() for the meaning
            of remap and twobytes.
        '''
        if self.startswith('('):
            return self.decode_regular(remap)
        return self.decode_hex(remap, twobytes)

    @classmethod
    def encode(cls, source, usehex=False):
        ''' Create a literal PdfString from source.

            Backslashes and parentheses in source are escaped.  On
            Python 2 a unicode source is encoded as UTF-8 first; any
            other object is converted with str().

            usehex must be false: hexadecimal output is not supported
            yet and requesting it raises AssertionError.
        '''
        assert not usehex, "Not supported yet"
        try:
            needs_utf8 = isinstance(source, unicode)  # noqa: F821 (Python 2)
        except NameError:
            # Python 3 has no separate unicode type; str is already text.
            needs_utf8 = False
        if needs_utf8:
            source = source.encode('utf-8')
        else:
            source = str(source)
        # Escape backslashes first so the backslashes added for the
        # parentheses are not doubled.
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' + source + ')')