74 lines
2.6 KiB
Python
74 lines
2.6 KiB
Python
# A part of pdfrw (pdfrw.googlecode.com)
|
|
# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
|
|
# MIT license -- See LICENSE.txt for details
|
|
|
|
import re
|
|
|
|
class PdfString(str):
|
|
''' A PdfString is an encoded string. It has a decode
|
|
method to get the actual string data out, and there
|
|
is an encode class method to create such a string.
|
|
Like any PDF object, it could be indirect, but it
|
|
defaults to being a direct object.
|
|
'''
|
|
indirect = False
|
|
unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n',
|
|
'\\r':'\r', '\\t':'\t',
|
|
'\\\r\n': '', '\\\r':'', '\\\n':'',
|
|
'\\\\':'\\', '\\':'',
|
|
}
|
|
unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)'
|
|
unescape_func = re.compile(unescape_pattern).split
|
|
|
|
hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
|
|
hex_func = re.compile(hex_pattern).split
|
|
|
|
hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
|
|
hex_func2 = re.compile(hex_pattern2).split
|
|
|
|
hex_funcs = hex_func, hex_func2
|
|
|
|
def decode_regular(self, remap=chr):
|
|
assert self[0] == '(' and self[-1] == ')'
|
|
mylist = self.unescape_func(self[1:-1])
|
|
result = []
|
|
unescape = self.unescape_dict.get
|
|
for chunk in mylist:
|
|
chunk = unescape(chunk, chunk)
|
|
if chunk.startswith('\\') and len(chunk) > 1:
|
|
value = int(chunk[1:], 8)
|
|
# FIXME: TODO: Handle unicode here
|
|
if value > 127:
|
|
value = 127
|
|
chunk = remap(value)
|
|
if chunk:
|
|
result.append(chunk)
|
|
return ''.join(result)
|
|
|
|
def decode_hex(self, remap=chr, twobytes=False):
|
|
data = ''.join(self.split())
|
|
data = self.hex_funcs[twobytes](data)
|
|
chars = data[1::2]
|
|
other = data[0::2]
|
|
assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self
|
|
return ''.join([remap(int(x, 16)) for x in chars])
|
|
|
|
def decode(self, remap=chr, twobytes=False):
|
|
if self.startswith('('):
|
|
return self.decode_regular(remap)
|
|
|
|
else:
|
|
return self.decode_hex(remap, twobytes)
|
|
|
|
def encode(cls, source, usehex=False):
|
|
assert not usehex, "Not supported yet"
|
|
if isinstance(source, unicode):
|
|
source = source.encode('utf-8')
|
|
else:
|
|
source = str(source)
|
|
source = source.replace('\\', '\\\\')
|
|
source = source.replace('(', '\\(')
|
|
source = source.replace(')', '\\)')
|
|
return cls('(' +source + ')')
|
|
encode = classmethod(encode)
|