# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

import re
|
|
|
|
|
|
class PdfString(str):
    ''' A PdfString is an encoded string.  It has a decode
        method to get the actual string data out, and there
        is an encode class method to create such a string.
        Like any PDF object, it could be indirect, but it
        defaults to being a direct object.

        Two on-disk forms are understood by decode():

          - literal strings, delimited by parentheses, which may
            contain backslash escapes:  (Hello \\(world\\))
          - hexadecimal strings, delimited by angle brackets,
            which may contain whitespace:  <48 65 6C 6C 6F>
    '''
    indirect = False

    # Replacement text for every fixed escape sequence matched by
    # unescape_pattern.  A backslash followed by an end-of-line is a
    # line continuation and vanishes; a lone backslash (one that is
    # not part of a recognized escape) is dropped, which is also how
    # '\(' and '\)' turn into plain parentheses.
    unescape_dict = {'\\b': '\b', '\\f': '\f', '\\n': '\n',
                     '\\r': '\r', '\\t': '\t',
                     '\\\r\n': '', '\\\r': '', '\\\n': '',
                     '\\\\': '\\', '\\': '',
                     }

    # Order matters: the doubled backslash must be tried first and the
    # lone backslash last.  Octal escapes are limited to one to three
    # octal digits, so '\1234' is '\123' followed by a literal '4' and
    # '\8' is just an ignored backslash followed by '8'.
    unescape_pattern = (r'(\\\\|\\b|\\f|\\n|\\r|\\t'
                        r'|\\\r\n|\\\r|\\\n|\\[0-7]{1,3}|\\)')
    unescape_func = re.compile(unescape_pattern).split

    # One byte per character: two hex digits (or a single stray digit).
    hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
    hex_func = re.compile(hex_pattern).split

    # Two bytes per character: four hex digits, falling back to the
    # shorter forms for whatever is left over at the end.
    hex_pattern2 = ('([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|'
                    '[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])')
    hex_func2 = re.compile(hex_pattern2).split

    # Indexed by the boolean ``twobytes`` argument of decode_hex().
    hex_funcs = hex_func, hex_func2

    def decode_regular(self, remap=chr):
        ''' Decode a literal string of the form "(...)".

            remap is called with the integer value of each octal
            escape and must return the replacement text.

            Returns the unescaped contents without the enclosing
            parentheses.  Raises AssertionError if the string is
            not wrapped in parentheses.
        '''
        # Slices instead of indexes so that an empty string fails the
        # assertion rather than raising IndexError.
        assert self[:1] == '(' and self[-1:] == ')'
        unescape = self.unescape_dict.get
        result = []
        # split() with a capturing group alternates plain text and the
        # escape sequences that separated it.
        for chunk in self.unescape_func(self[1:-1]):
            chunk = unescape(chunk, chunk)
            # Anything still starting with a backslash and longer than
            # one character is an octal escape (an unescaped '\\\\'
            # has length one and is left alone).
            if chunk.startswith('\\') and len(chunk) > 1:
                # High-order overflow (e.g. '\777') is ignored, so
                # only the low eight bits are kept.
                # NOTE: no text encoding is applied to the resulting
                # byte values; pass a custom remap if one is needed.
                chunk = remap(int(chunk[1:], 8) & 0xFF)
            if chunk:
                result.append(chunk)
        return ''.join(result)

    def decode_hex(self, remap=chr, twobytes=False):
        ''' Decode a hexadecimal string of the form "<...>".

            remap is called with the integer value of each group
            of hex digits and must return the replacement text.
            If twobytes is true, digits are consumed four at a
            time (a trailing group of two is decoded on its own).

            Whitespace anywhere in the string is ignored.  If the
            number of hex digits is odd, the missing final digit
            is taken to be 0.

            Raises AssertionError if the string is not wrapped in
            angle brackets or contains non-hexadecimal characters.
        '''
        data = ''.join(self.split())
        assert data.startswith('<') and data.endswith('>'), self
        digits = data[1:-1]
        if len(digits) % 2:
            digits += '0'
        pieces = self.hex_funcs[twobytes](digits)
        # Even-indexed pieces are whatever lay between the hex groups;
        # for well-formed input they are all empty.
        assert not ''.join(pieces[0::2]), self
        return ''.join([remap(int(x, 16)) for x in pieces[1::2]])

    def decode(self, remap=chr, twobytes=False):
        ''' Return the decoded contents of this string.

            Strings starting with "(" are decoded as literal
            strings; everything else is decoded as a hexadecimal
            string.  See decode_regular() and decode_hex() for
            the meaning of remap and twobytes.
        '''
        if self.startswith('('):
            return self.decode_regular(remap)
        return self.decode_hex(remap, twobytes)

    @classmethod
    def encode(cls, source, usehex=False):
        ''' Create a literal-string PdfString from source.

            Backslashes and parentheses in source are escaped and
            the result is wrapped in parentheses.  Hexadecimal
            output (usehex) is not supported yet and triggers an
            AssertionError.
        '''
        assert not usehex, "Not supported yet"
        # Backslashes first, so the ones added for the parentheses
        # below are not doubled.
        source = source.replace('\\', '\\\\')
        source = source.replace('(', '\\(')
        source = source.replace(')', '\\)')
        return cls('(' + source + ')')
|