debian-pdfrw/tests/test_pdfstring.py

121 lines
3.9 KiB
Python
Executable File

#! /usr/bin/env python
# encoding: utf-8
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
# 2016 James Laird-Wah, Sydney, Australia
# MIT license -- See LICENSE.txt for details
'''
Run from the directory above like so:
python -m tests.test_pdfstring
'''
from pdfrw import PdfString
from pdfrw.py23_diffs import convert_store
import unittest
class TestBaseEncoding(unittest.TestCase):
def encode(self, value):
x = PdfString.encode(value)
if isinstance(value, type(u'')):
y = PdfString.from_unicode(value)
else:
y = PdfString.from_bytes(value)
self.assertEqual(x, y)
return x
def decode(self, value):
s = PdfString(value)
x = s.to_unicode()
y = s.decode()
self.assertEqual(x, y)
return x
def decode_bytes(self, decode_this, expected):
""" Decode to bytes"""
self.assertEqual(PdfString(decode_this).to_bytes(),
convert_store(expected))
def roundtrip(self, value, expected=None):
result = self.encode(value)
self.assertEqual(value, self.decode(result))
if expected is not None:
self.assertEqual(result, expected)
return result
def test_doubleslash(self):
self.roundtrip('\\')
self.roundtrip(r'\\')
def test_unicode_encoding(self):
# These chars are in PdfDocEncoding
self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(')
# These chars are not in PdfDocEncoding
self.assertEqual(self.roundtrip(u'δΩσ')[0], '<')
# Check that we're doing a reasonable encoding
# Might want to change this later if we change the definition of reasonable
self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)')
self.roundtrip(u'(\n\u0101', '<FEFF0028000A0101>')
def test_constructor(self):
obj = PdfString('hello')
def test_continuation(self):
# See PDF 1.7 ref section 3.2 page 55
s1 = PdfString('(These two strings are the same.)')
self.assertEqual(s1.decode(), s1[1:-1])
s2 = PdfString('(These \\\ntwo strings \\\nare the same.)')
self.assertEqual(s1.decode(), s2.decode())
s2 = PdfString(s2.replace('\n', '\r'))
self.assertEqual(s1.decode(), s2.decode())
s2 = PdfString(s2.replace('\r', '\r\n'))
self.assertEqual(s1.decode(), s2.decode())
def test_hex_whitespace(self):
# See PDF 1.7 ref section 3.2 page 56
self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB')
def test_unicode_escaped_decode(self):
# Some PDF producers happily put unicode strings in PdfDocEncoding,
# because the Unicode BOM and \0 are valid code points
decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)')
self.assertEqual(decoded, "hello")
def test_unescaping(self):
self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)',
' ( ) \\ \n \t \f \r \r\n \\n')
self.decode_bytes(r'(\b\010\10)', '\b\b\b')
self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)',
'\n\n\r\r\t\t\b\b\f\f()\001\023\f3')
self.decode_bytes(r'(\\\nabc)', '\\\nabc')
self.decode_bytes(r'(\ )', ' ')
def test_BOM_variants(self):
self.roundtrip(u'\ufeff', '<FEFFFEFF>')
self.roundtrip(u'\ufffe', '<FEFFFFFE>')
self.roundtrip(u'\xfe\xff', '<FEFF00FE00FF>')
self.roundtrip(u'\xff\xfe', '(\xff\xfe)')
self.assertRaises(UnicodeError, PdfString.from_unicode,
u'þÿ blah', text_encoding='pdfdocencoding')
def test_byte_encode(self):
self.assertEqual(self.encode(b'ABC'), '(ABC)')
def test_nullstring(self):
self.assertEqual(PdfString('<>').to_bytes(), b'')
self.assertEqual(PdfString('()').to_bytes(), b'')
def main():
unittest.main()
if __name__ == '__main__':
main()