diff --git a/.gitignore b/.gitignore
index 6260e55..b4c3391 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,17 @@
+# OSX
+.DS_Store
+.AppleDouble
+.LSOverride
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+
# Development artifacts
diffs.txt
examples/*.pdf
@@ -9,6 +23,7 @@ tests/pdfrw
tests/static_pdfs
tests/ramdisk
tests/saved_results
+tests/tmp_results
wiki/
diff --git a/.travis.yml b/.travis.yml
index caa88f5..dcdd573 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,8 @@ python:
- "2.7"
- "3.3"
- "3.4"
+ - "3.5"
+ - "3.6"
- "nightly"
# command to install dependencies
before_install:
@@ -11,6 +13,7 @@ before_install:
install:
- "pip install ."
- "pip install reportlab || true"
+ - "pip install PyCrypto || true"
- "pip install zlib || true"
- "pip install unittest2 || true"
# command to run tests
diff --git a/LICENSE.txt b/LICENSE.txt
index 8d3c13d..e176dc4 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -8,10 +8,22 @@ Mathieu Fenniak and licensed under the BSD license (also reproduced below).
Please add any missing authors here:
-Copyright (c) 2006-2015 Patrick Maupin. All rights reserved.
+Copyright (c) 2006-2017 Patrick Maupin. All rights reserved.
Copyright (c) 2006 Mathieu Fenniak. All rights reserved.
Copyright (c) 2010 Attila Tajti. All rights reserved.
Copyright (c) 2012 Nerijus Mika. All rights reserved.
+Copyright (c) 2015 Bastien Gandouet. All rights reserved.
+Copyright (c) 2015 Tzerjen Wei. All rights reserved.
+Copyright (c) 2015 Jorj X. McKie. All rights reserved.
+Copyright (c) 2015 Nicholas Devenish. All rights reserved.
+Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved.
+Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved.
+Copyright (c) 2016 James Laird-Wah. All rights reserved.
+Copyright (c) 2016 Marcus Brinkmann. All rights reserved.
+Copyright (c) 2016 Edward Betts. All rights reserved.
+Copyright (c) 2016 Patrick Mazulo. All rights reserved.
+Copyright (c) 2017 Haochen Wu. All rights reserved.
+Copyright (c) 2017 Jon Lund Steffensen. All rights reserved.
MIT License:
diff --git a/MANIFEST.in b/MANIFEST.in
index f90ac68..493839e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,3 @@
include *.txt *.in *.rst
recursive-include examples *.txt *.py
+recursive-include tests *.py
diff --git a/README.rst b/README.rst
index 2c91345..dc4a52c 100644
--- a/README.rst
+++ b/README.rst
@@ -1,6 +1,6 @@
-=============
-pdfrw 0.2b1
-=============
+==================
+pdfrw 0.4
+==================
:Author: Patrick Maupin
@@ -14,7 +14,7 @@ Introduction
**pdfrw** is a Python library and utility that reads and writes PDF files:
-* Version 0.2 is tested and works on Python 2.6, 2.7, 3.3, and 3.4.
+* Version 0.4 is tested and works on Python 2.6, 2.7, 3.3, 3.4, 3.5, and 3.6
* Operations include subsetting, merging, rotating, modifying metadata, etc.
* The fastest pure Python PDF parser available
* Has been used for years by a printer in pre-press production
@@ -74,10 +74,13 @@ try to use pdftk to uncompress and/or unencrypt them first.
output.
* `rl1/subset.py`__ Another subsetting example, using reportlab canvas for
output.
-* `rl1/platypus_pdf_template.py`__ Aother watermarking example, using
+* `rl1/platypus_pdf_template.py`__ Another watermarking example, using
reportlab canvas and generated output for the document. Contributed
by user asannes.
* `rl2`__ Experimental code for parsing graphics. Needs work.
+* `subset_booklets.py`__ shows an example of creating a full printable pdf
+  version in a more professional and practical way ( take a look at
+ http://www.wikihow.com/Bind-a-Book )
__ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py
__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
@@ -95,6 +98,7 @@ __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset_booklets.py
Notes on selected examples
------------------------------------
@@ -715,6 +719,8 @@ non-pure-Python libraries
files.
- `pycairo `__ can write PDF
files.
+- `PyMuPDF <https://github.com/rk700/PyMuPDF>`_ high performance rendering
+  of PDF, (Open)XPS, CBZ and EPUB
Other tools
-----------
@@ -723,12 +729,50 @@ Other tools
line tool for basic PDF manipulation. It complements pdfrw extremely
well, supporting many operations such as decryption and decompression
that pdfrw cannot do.
+- `MuPDF <http://mupdf.com/>`_ is a free top performance PDF, (Open)XPS, CBZ and EPUB rendering library
+ that also comes with some command line tools. One of those, ``mutool``, has big overlaps with pdftk's -
+ except it is up to 10 times faster.
Release information
=======================
Revisions:
+0.4 -- Released 18 September, 2017
+
+ - Python 3.6 added to test matrix
+ - Proper unicode support for text strings in PDFs added
+ - buildxobj fixes allow better support creating form XObjects
+ out of compressed pages in some cases
+ - Compression fixes for Python 3+
+ - New subset_booklets.py example
+ - Bug with non-compressed indices into compressed object streams fixed
+ - Bug with distinguishing compressed object stream first objects fixed
+ - Better error reporting added for some invalid PDFs (e.g. when reading
+ past the end of file)
+ - Better scrubbing of old bookmark information when writing PDFs, to
+ remove dangling references
+ - Refactoring of pdfwriter, including updating API, to allow future
+ enhancements for things like incremental writing
+ - Minor tokenizer speedup
+ - Some flate decompressor bugs fixed
+ - Compression and decompression tests added
+ - Tests for new unicode handling added
+ - PdfReader.readpages() recursion error (issue #92) fixed.
+ - Initial crypt filter support added
+
+
+0.3 -- Released 19 October, 2016.
+
+ - Python 3.5 added to test matrix
+ - Better support under Python 3.x for in-memory PDF file-like objects
+ - Some pagemerge and Unicode patches added
+ - Changes to logging allow better coexistence with other packages
+ - Fix for "from pdfrw import \*"
+ - New fancy_watermark.py example shows off capabilities of pagemerge.py
+ - metadata.py example renamed to cat.py
+
+
0.2 -- Released 21 June, 2015. Supports Python 2.6, 2.7, 3.3, and 3.4.
- Several bugs have been fixed
diff --git a/debian/changelog b/debian/changelog
index a0836e5..3c09d9c 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,32 @@
+pdfrw (0.4-2) unstable; urgency=medium
+
+ * Bumped Standards-Version to 4.1.3
+ * Replaced python-reportlab in python3-pdfrw by python3-reportlab
+
+ -- Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>  Thu, 12 Apr 2018 12:14:12 -0300
+
+pdfrw (0.4-1) unstable; urgency=medium
+
+ * New upstream version
+ * Added "Multi-Arch: foreign" to python-pdfrw-doc
+
+ [ Lucas Kanashiro ]
+ * Update years of upstream copyright
+ * debian/copyright: use https:// instead of http:// in Format field
+
+ -- Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>  Thu, 21 Sep 2017 09:55:46 -0300
+
+pdfrw (0.3-1) unstable; urgency=medium
+
+ * New maintainer (Closes: #738298)
+ * New upstream version
+ * Bumped Standards-Version to 4.0.0
+ * Bumped debian/compat to 10
+ * Depend on debhelper >= 10
+ * Added package test with autopkgtests tool
+
+ -- Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>  Wed, 30 Aug 2017 19:18:45 -0300
+
pdfrw (0.2-3) unstable; urgency=medium
* QA upload.
diff --git a/debian/compat b/debian/compat
index ec63514..f599e28 100644
--- a/debian/compat
+++ b/debian/compat
@@ -1 +1 @@
-9
+10
diff --git a/debian/control b/debian/control
index 00ee686..4c3f8c7 100644
--- a/debian/control
+++ b/debian/control
@@ -1,20 +1,21 @@
Source: pdfrw
Section: python
Priority: optional
-Maintainer: Debian QA Group <packages@qa.debian.org>
+Maintainer: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
Build-Depends:
- debhelper (>= 9),
+ debhelper (>= 10),
dh-python,
python-all (>= 2.6.6-3~),
python-setuptools,
python3-all,
python3-setuptools,
-Standards-Version: 3.9.8
+Standards-Version: 4.1.3
Homepage: https://github.com/pmaupin/pdfrw
Vcs-Git: https://git.dgit.debian.org/pdfrw
Vcs-Browser: https://browse.dgit.debian.org/pdfrw.git/
X-Python-Version: >= 2.6
X-Python3-Version: >= 3.2
+Testsuite: autopkgtest-pkg-python
Package: python-pdfrw
Architecture: all
@@ -44,6 +45,7 @@ Description: PDF file manipulation library (Python 2)
Package: python-pdfrw-doc
Architecture: all
+Multi-Arch: foreign
Depends:
${misc:Depends},
Section: doc
@@ -72,7 +74,7 @@ Depends:
${python3:Depends},
Suggests:
python-pdfrw-doc,
- python-reportlab,
+ python3-reportlab,
Description: PDF file manipulation library (Python 3)
pdfrw can read and write PDF files, and can also be used to read in PDFs which
can then be used inside reportlab.
diff --git a/debian/copyright b/debian/copyright
index bb4cf16..679ba8f 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,10 +1,10 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: pdfrw
Upstream-Contact: Patrick Maupin <pmaupin@gmail.com>
Source: https://github.com/pmaupin/pdfrw
Files: *
-Copyright: © 2006-2015 Patrick Maupin
+Copyright: © 2006-2017 Patrick Maupin
© 2010 Attila Tajti
© 2012 Narijus Mika
License: Expat
diff --git a/examples/4up.py b/examples/4up.py
index ad2bd3b..91ac64e 100755
--- a/examples/4up.py
+++ b/examples/4up.py
@@ -27,7 +27,7 @@ def get4(srcpages):
inpfn, = sys.argv[1:]
outfn = '4up.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
-writer = PdfWriter()
+writer = PdfWriter(outfn)
for index in range(0, len(pages), 4):
writer.addpage(get4(pages[index:index + 4]))
-writer.write(outfn)
+writer.write()
diff --git a/examples/README.txt b/examples/README.txt
index 242f5be..5564501 100644
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -6,7 +6,7 @@ alter.py -- Simple example of making a very slight modification to a PDF.
booklet.py -- Converts a PDF into a booklet.
-metadata.py -- Concatenates multiple PDFs, adds metadata.
+cat.py -- Concatenates multiple PDFs, adds metadata.
poster.py -- Changes the size of a PDF to create a poster
diff --git a/examples/alter.py b/examples/alter.py
index 45b9c76..bb236fa 100755
--- a/examples/alter.py
+++ b/examples/alter.py
@@ -19,6 +19,4 @@ outfn = 'alter.' + os.path.basename(inpfn)
trailer = PdfReader(inpfn)
trailer.Info.Title = 'My New Title Goes Here'
-writer = PdfWriter()
-writer.trailer = trailer
-writer.write(outfn)
+PdfWriter(outfn, trailer=trailer).write()
diff --git a/examples/booklet.py b/examples/booklet.py
index 4758b08..e6b523d 100755
--- a/examples/booklet.py
+++ b/examples/booklet.py
@@ -1,16 +1,23 @@
#!/usr/bin/env python
'''
-usage: booklet.py my.pdf
+usage: booklet.py [-p] my.pdf
Creates booklet.my.pdf
Pages organized in a form suitable for booklet printing, e.g.
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
+
+The output would be using the same type of sheet
+and you can get up to 3 blank sides if -p is enabled.
+
+Otherwise the two sides in the middle will be in original page size
+and you can have 1 blank side at most.
+
'''
-import sys
import os
+import argparse
from pdfrw import PdfReader, PdfWriter, PageMerge
@@ -21,13 +28,23 @@ def fixpage(*pages):
return result.render()
-inpfn, = sys.argv[1:]
+parser = argparse.ArgumentParser()
+parser.add_argument("input", help="Input pdf file name")
+parser.add_argument("-p", "--padding", action = "store_true",
+ help="Padding the document so that all pages use the same type of sheet")
+args = parser.parse_args()
+
+inpfn = args.input
outfn = 'booklet.' + os.path.basename(inpfn)
ipages = PdfReader(inpfn).pages
-# Make sure we have an even number
-if len(ipages) & 1:
- ipages.append(None)
+if args.padding:
+ pad_to = 4
+else:
+ pad_to = 2
+
+# Make sure we have a correct number of sides
+ipages += [None]*(-len(ipages)%pad_to)
opages = []
while len(ipages) > 2:
@@ -36,4 +53,4 @@ while len(ipages) > 2:
opages += ipages
-PdfWriter().addpages(opages).write(outfn)
+PdfWriter(outfn).addpages(opages).write()
diff --git a/examples/extract.py b/examples/extract.py
index 3756b4f..dd6e267 100755
--- a/examples/extract.py
+++ b/examples/extract.py
@@ -22,6 +22,6 @@ outfn = 'extract.' + os.path.basename(inpfn)
pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72))
if not pages:
raise IndexError("No XObjects found")
-writer = PdfWriter()
+writer = PdfWriter(outfn)
writer.addpages(pages)
-writer.write(outfn)
+writer.write()
diff --git a/examples/fancy_watermark.py b/examples/fancy_watermark.py
new file mode 100755
index 0000000..e9c797d
--- /dev/null
+++ b/examples/fancy_watermark.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+'''
+Enhanced example of watermarking using form xobjects (pdfrw).
+
+usage: fancy_watermark.py [-u] my.pdf single_page.pdf
+
+Creates watermark.my.pdf, with every page overlaid with
+first page from single_page.pdf. If -u is selected, watermark
+will be placed underneath page (painted first).
+
+The stock watermark.py program assumes all pages are the same
+size. This example deals with pages of differing sizes in order
+to show some concepts of positioning and scaling.
+
+This version applies the watermark such that the upper right
+corner of the watermark is at the upper right corner of the
+document page for odd pages, and at the upper left corner
+of the document page for even pages, for each page of the
+document.
+
+It also rescales the size of the watermark if the watermark
+is too wide for the page.
+
+These scaling and positioning adjustments can easily
+be customized for any particular application.
+
+To handle documents with different page sizes, a cache is
+maintained of a modified intermediate watermark object
+for each page size.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+# Get all the filenames
+
+argv = sys.argv[1:]
+underneath = '-u' in argv
+if underneath:
+ del argv[argv.index('-u')]
+inpfn, wmarkfn = argv
+outfn = 'watermark.' + os.path.basename(inpfn)
+
+# Open both the source files
+wmark_trailer = PdfReader(wmarkfn)
+trailer = PdfReader(inpfn)
+
+# Handle different sized pages in same document with
+# a memoization cache, so we don't create more watermark
+# objects than we need to (typically only one per document).
+
+wmark_page = wmark_trailer.pages[0]
+wmark_cache = {}
+
+# Process every page
+for pagenum, page in enumerate(trailer.pages, 1):
+
+ # Get the media box of the page, and see
+ # if we have a matching watermark in the cache
+ mbox = tuple(float(x) for x in page.MediaBox)
+ odd = pagenum & 1
+ key = mbox, odd
+ wmark = wmark_cache.get(key)
+ if wmark is None:
+
+ # Create and cache a new watermark object.
+ wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]
+
+ # The math is more complete than it probably needs to be,
+ # because the origin of all pages is almost always (0, 0).
+ # Nonetheless, we illustrate all the values and their names.
+
+ page_x, page_y, page_x1, page_y1 = mbox
+ page_w = page_x1 - page_x
+ page_h = page_y1 - page_y # For illustration, not used
+
+ # Scale the watermark if it is too wide for the page
+ # (Could do the same for height instead if needed)
+ if wmark.w > page_w:
+ wmark.scale(1.0 * page_w / wmark.w)
+
+ # Always put watermark at the top of the page
+ # (but see horizontal positioning for other ideas)
+ wmark.y += page_y1 - wmark.h
+
+ # For odd pages, put it at the left of the page,
+ # and for even pages, put it on the right of the page.
+ if odd:
+ wmark.x = page_x
+ else:
+ wmark.x += page_x1 - wmark.w
+
+ # Optimize the case where the watermark is same width
+ # as page.
+ if page_w == wmark.w:
+ wmark_cache[mbox, not odd] = wmark
+
+ # Add the watermark to the page
+ PageMerge(page).add(wmark, prepend=underneath).render()
+
+# Write out the destination file
+PdfWriter(outfn, trailer=trailer).write()
diff --git a/examples/poster.py b/examples/poster.py
index 7f1c1c2..1db9378 100755
--- a/examples/poster.py
+++ b/examples/poster.py
@@ -37,7 +37,7 @@ def adjust(page, margin=36, scale=4.8):
inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
-writer = PdfWriter()
+writer = PdfWriter(outfn)
writer.addpage(adjust(reader.pages[0]))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
-writer.write(outfn)
+writer.write()
diff --git a/examples/print_two.py b/examples/print_two.py
index c54eaee..b710192 100755
--- a/examples/print_two.py
+++ b/examples/print_two.py
@@ -29,4 +29,4 @@ def fixpage(page, count=[0]):
inpfn, = sys.argv[1:]
outfn = 'print_two.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
-PdfWriter().addpages(fixpage(x) for x in pages).write(outfn)
+PdfWriter(outfn).addpages(fixpage(x) for x in pages).write()
diff --git a/examples/rl2/decodegraphics.py b/examples/rl2/decodegraphics.py
index e2f3a9f..d26daf7 100644
--- a/examples/rl2/decodegraphics.py
+++ b/examples/rl2/decodegraphics.py
@@ -232,6 +232,19 @@ def parse_text_out(self, token='Tj', params='t'):
text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
self.tpath.textOut(text)
+def parse_lf_text_out(self, token="'", params='t'):
+ self.tpath.textLine()
+ text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
+ self.tpath.textOut(text)
+
+
+def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
+ self.tpath.setWordSpace(params[0])
+ self.tpath.setCharSpace(params[1])
+ self.tpath.textLine()
+ text = params[2].decode(self.curfont.remap, self.curfont.twobyte)
+ self.tpath.textOut(text)
+
def parse_TJ(self, token='TJ', params='a'):
remap = self.curfont.remap
@@ -377,7 +390,7 @@ class _ParseClass(object):
self.gpath = None
self.tpath = None
self.fontdict = dict((x, FontInfo(y)) for
- (x, y) in page.Resources.Font.iteritems())
+ (x, y) in page.Resources.Font.items())
for token in self.tokens:
info = dispatch(token)
@@ -424,7 +437,7 @@ def debugparser(undisturbed=set('parse_array'.split())):
myfunc = oldval[0]
return myfunc, oldval[1]
return dict((x, getvalue(y))
- for (x, y) in _ParseClass.dispatch.iteritems())
+ for (x, y) in _ParseClass.dispatch.items())
class _DebugParse(_ParseClass):
dispatch = debugdispatch()
@@ -435,10 +448,10 @@ parsepage = _ParseClass.parsepage
if __name__ == '__main__':
import sys
- from pdfreader import PdfReader
+ from pdfrw import PdfReader
parse = debugparser()
fname, = sys.argv[1:]
- pdf = PdfReader(fname)
+ pdf = PdfReader(fname, decompress=True)
for i, page in enumerate(pdf.pages):
print ('\nPage %s ------------------------------------' % i)
parse(page)
diff --git a/examples/rotate.py b/examples/rotate.py
index 8b10d05..0115401 100755
--- a/examples/rotate.py
+++ b/examples/rotate.py
@@ -36,6 +36,6 @@ for onerange in ranges:
pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or
0) + rotate) % 360
-outdata = PdfWriter()
+outdata = PdfWriter(outfn)
outdata.trailer = trailer
-outdata.write(outfn)
+outdata.write()
diff --git a/examples/subset.py b/examples/subset.py
index 30a577a..e965850 100755
--- a/examples/subset.py
+++ b/examples/subset.py
@@ -20,10 +20,10 @@ assert ranges, "Expected at least one range"
ranges = ([int(y) for y in x.split('-')] for x in ranges)
outfn = 'subset.%s' % os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
-outdata = PdfWriter()
+outdata = PdfWriter(outfn)
for onerange in ranges:
onerange = (onerange + onerange[-1:])[:2]
for pagenum in range(onerange[0], onerange[1]+1):
outdata.addpage(pages[pagenum-1])
-outdata.write(outfn)
+outdata.write()
diff --git a/examples/subset_booklets.py b/examples/subset_booklets.py
new file mode 100755
index 0000000..db0b9af
--- /dev/null
+++ b/examples/subset_booklets.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+'''
+usage: subset_booklets.py my.pdf
+
+Creates subset_booklets.my.pdf
+
+Pages organized in a form suitable for booklet printing, e.g.
+to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
+Instead of a large booklet, the pdf is divided into several mini
+booklets. The reason is: professional printing works this way:
+ - Print all of several mini booklets (subsets of the booklet);
+ - Sew each mini booklet individually;
+ - glue them all together;
+ - Insert the cover.
+
+ Take a look at http://www.wikihow.com/Bind-a-Book
+'''
+
+import sys
+import os
+import time
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+BOOKLET_SIZE = 20
+START = time.time()
+
+def fixpage(*pages):
+ result = PageMerge() + (x for x in pages if x is not None)
+ result[-1].x += result[0].w
+ return result.render()
+
+INPFN, = sys.argv[1:]
+OUTFN = 'booklet.' + os.path.basename(INPFN)
+ALL_IPAGES = PdfReader(INPFN).pages
+print 'The pdf file '+str(INPFN)+' has '+str(len(ALL_IPAGES))+' pages.'
+
+#Make sure we have an even number
+if len(ALL_IPAGES) & 1:
+ ALL_IPAGES.append(None)
+ print 'Inserting one more blank page to make pages number even.'
+NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)
+
+print 'Making '+str(NUM_OF_ITER)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.'
+opages = []
+for iteration in range(0, NUM_OF_ITER):
+ ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
+ while len(ipages) > 2:
+ opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+ opages.append(fixpage(ipages.pop(0), ipages.pop()))
+
+# Making one more subbooklet with the left pages
+ipages = ALL_IPAGES[len(ALL_IPAGES)-ITERS_LEFT:len(ALL_IPAGES)]
+while len(ipages) > 2:
+ opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+ opages.append(fixpage(ipages.pop(0), ipages.pop()))
+if len(ipages) >= 1:
+ opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+
+PdfWriter(OUTFN).addpages(opages).write()
+print 'It took '+ str(round(time.time()-START, 2))+' seconds to make the pdf subbooklets changes.'
diff --git a/examples/unspread.py b/examples/unspread.py
index 4b3bc5d..4caa973 100755
--- a/examples/unspread.py
+++ b/examples/unspread.py
@@ -26,7 +26,7 @@ def splitpage(src):
inpfn, = sys.argv[1:]
outfn = 'unspread.' + os.path.basename(inpfn)
-writer = PdfWriter()
+writer = PdfWriter(outfn)
for page in PdfReader(inpfn).pages:
writer.addpages(splitpage(page))
-writer.write(outfn)
+writer.write()
diff --git a/examples/watermark.py b/examples/watermark.py
index 96b686b..1188502 100755
--- a/examples/watermark.py
+++ b/examples/watermark.py
@@ -9,10 +9,14 @@ Creates watermark.my.pdf, with every page overlaid with
first page from single_page.pdf. If -u is selected, watermark
will be placed underneath page (painted first).
-NB: At one point, this example was extremely complicated, with
- multiple options. That only led to errors in implementation,
- so it has been re-simplified in order to show basic principles
- of the library operation and to match the other examples better.
+NOTE 1: This program assumes that all pages (including the watermark
+ page) are the same size. For other possibilities, see
+ the fancy_watermark.py example.
+
+NOTE 2: At one point, this example was extremely complicated, with
+ multiple options. That only led to errors in implementation,
+ so it has been re-simplified in order to show basic principles
+ of the library operation and to match the other examples better.
'''
import sys
@@ -30,4 +34,4 @@ wmark = PageMerge().add(PdfReader(wmarkfn).pages[0])[0]
trailer = PdfReader(inpfn)
for page in trailer.pages:
PageMerge(page).add(wmark, prepend=underneath).render()
-PdfWriter().write(outfn, trailer)
+PdfWriter(outfn, trailer=trailer).write()
diff --git a/pdfrw/__init__.py b/pdfrw/__init__.py
index a36a8cb..cf7644a 100644
--- a/pdfrw/__init__.py
+++ b/pdfrw/__init__.py
@@ -10,13 +10,14 @@ from .tokens import PdfTokens
from .errors import PdfParseError
from .pagemerge import PageMerge
-__version__ = '0.2'
+__version__ = '0.4'
# Add a tiny bit of compatibility to pyPdf
PdfFileReader = PdfReader
PdfFileWriter = PdfWriter
-__all__ = [PdfWriter, PdfReader, PdfObject, PdfName, PdfArray,
- PdfTokens, PdfParseError, PdfDict, IndirectPdfDict,
- PdfString, PageMerge]
+__all__ = """PdfWriter PdfReader PdfObject PdfName PdfArray
+ PdfTokens PdfParseError PdfDict IndirectPdfDict
+ PdfString PageMerge""".split()
+
diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py
index d210c67..f132795 100644
--- a/pdfrw/buildxobj.py
+++ b/pdfrw/buildxobj.py
@@ -32,6 +32,8 @@ from .objects import PdfDict, PdfArray, PdfName
from .pdfreader import PdfReader
from .errors import log, PdfNotImplementedError
from .py23_diffs import iteritems
+from .uncompress import uncompress
+from .compress import compress
class ViewInfo(object):
@@ -169,6 +171,10 @@ def _build_cache(contents, allow_compressed):
and save it along with private cache info.
Assumes validity has been pre-checked if
we have a non-None xobj_copy.
+
+ Also, the spec says nothing about nested arrays,
+ so we assume those don't exist until we see one
+ in the wild.
'''
try:
xobj_copy = contents.xobj_copy
@@ -183,9 +189,20 @@ def _build_cache(contents, allow_compressed):
array = [contents]
private = contents.private
- # The spec says nothing about nested arrays. Will
- # assume that's not a problem until we encounter them...
+ # If we don't allow compressed objects, OR if we have multiple compressed
+ # objects, we try to decompress them, and fail if we cannot do that.
+ if not allow_compressed or len(array) > 1:
+ keys = set(x[0] for cdict in array for x in iteritems(cdict))
+ was_compressed = len(keys) > 1
+ if was_compressed:
+ # Make copies of the objects before we uncompress them.
+ array = [PdfDict(x) for x in array]
+ if not uncompress(array):
+ raise PdfNotImplementedError(
+ 'Xobjects with these compression parameters not supported: %s' %
+ keys)
+
xobj_copy = PdfDict(array[0])
xobj_copy.private.xobj_cachedict = {}
private.xobj_copy = xobj_copy
@@ -195,19 +212,9 @@ def _build_cache(contents, allow_compressed):
newlength = sum(int(x.Length) for x in array) + len(array) - 1
assert newlength == len(newstream)
xobj_copy.stream = newstream
+ if was_compressed and allow_compressed:
+ compress(xobj_copy)
- # Cannot currently cope with different kinds of
- # compression in the array, so just disallow it.
- allow_compressed = False
-
- if not allow_compressed:
- # Make sure there are no compression parameters
- for cdict in array:
- keys = [x[0] for x in iteritems(cdict)]
- if len(keys) != 1:
- raise PdfNotImplementedError(
- 'Xobjects with compression parameters not supported: %s' %
- keys)
return xobj_copy
diff --git a/pdfrw/compress.py b/pdfrw/compress.py
index 0479131..b7b4e75 100644
--- a/pdfrw/compress.py
+++ b/pdfrw/compress.py
@@ -3,14 +3,14 @@
# MIT license -- See LICENSE.txt for details
'''
-Currently, this sad little file only knows how to decompress
+Currently, this sad little file only knows how to compress
using the flate (zlib) algorithm. Maybe more later, but it's
not a priority for me...
'''
from .objects import PdfName
from .uncompress import streamobjects
-from .py23_diffs import zlib
+from .py23_diffs import zlib, convert_load, convert_store
def compress(mylist):
@@ -20,7 +20,7 @@ def compress(mylist):
if ftype is not None:
continue
oldstr = obj.stream
- newstr = zlib.compress(oldstr)
+ newstr = convert_load(zlib.compress(convert_store(oldstr)))
if len(newstr) < len(oldstr) + 30:
obj.stream = newstr
obj.Filter = flate
diff --git a/pdfrw/crypt.py b/pdfrw/crypt.py
new file mode 100644
index 0000000..dc00676
--- /dev/null
+++ b/pdfrw/crypt.py
@@ -0,0 +1,150 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2017 Jon Lund Steffensen
+# MIT license -- See LICENSE.txt for details
+
+from __future__ import division
+
+import hashlib
+import struct
+
+try:
+ from Crypto.Cipher import ARC4, AES
+ HAS_CRYPTO = True
+except ImportError:
+ HAS_CRYPTO = False
+
+from .objects import PdfDict, PdfName
+
+_PASSWORD_PAD = (
+ '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
+ '..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
+
+
+def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
+ for obj in mylist:
+ if isinstance(obj, PdfDict) and obj.stream is not None:
+ yield obj
+
+
+def create_key(password, doc):
+ """Create an encryption key (Algorithm 2 in PDF spec)."""
+ key_size = int(doc.Encrypt.Length or 40) // 8
+ padded_pass = (password + _PASSWORD_PAD)[:32]
+ hasher = hashlib.md5()
+ hasher.update(padded_pass)
+ hasher.update(doc.Encrypt.O.to_bytes())
+    hasher.update(struct.pack('<i', int(doc.Encrypt.P)))
+    hasher.update(doc.ID[0].to_bytes())
+    temp_hash = hasher.digest()
+
+    if int(doc.Encrypt.R or 0) >= 3:
+ for _ in range(50):
+ temp_hash = hashlib.md5(temp_hash[:key_size]).digest()
+
+ return temp_hash[:key_size]
+
+
+def create_user_hash(key, doc):
+ """Create the user password hash (Algorithm 4/5)."""
+ revision = int(doc.Encrypt.R or 0)
+ if revision < 3:
+ cipher = ARC4.new(key)
+ return cipher.encrypt(_PASSWORD_PAD)
+ else:
+ hasher = hashlib.md5()
+ hasher.update(_PASSWORD_PAD)
+ hasher.update(doc.ID[0].to_bytes())
+ temp_hash = hasher.digest()
+
+ for i in range(20):
+ temp_key = ''.join(chr(i ^ ord(x)) for x in key)
+ cipher = ARC4.new(temp_key)
+ temp_hash = cipher.encrypt(temp_hash)
+
+ return temp_hash
+
+
+def check_user_password(key, doc):
+ """Check that the user password is correct (Algorithm 6)."""
+ expect_user_hash = create_user_hash(key, doc)
+ revision = int(doc.Encrypt.R or 0)
+ if revision < 3:
+ return doc.Encrypt.U.to_bytes() == expect_user_hash
+ else:
+ return doc.Encrypt.U.to_bytes()[:16] == expect_user_hash
+
+
+class AESCryptFilter(object):
+ """Crypt filter corresponding to /AESV2."""
+ def __init__(self, key):
+ self._key = key
+
+ def decrypt_data(self, num, gen, data):
+ """Decrypt data (string/stream) using key (Algorithm 1)."""
+        key_extension = struct.pack('<i', num)[:3]
+        key_extension += struct.pack('<i', gen)[:2]
+        key_extension += 'sAlT'
+        temp_key = self._key + key_extension
+        temp_key = hashlib.md5(temp_key).digest()
+
+        buf_size = min(len(self._key) + 5, 16)
+        key = temp_key[:buf_size]
+
+        # The first 16 bytes of the stream hold the AES-CBC IV
+        iv = data[:16]
+        cipher = AES.new(key, AES.MODE_CBC, iv)
+        decrypted = cipher.decrypt(data[16:])
+
+        # Remove the trailing block padding
+        pad_size = ord(decrypted[-1])
+        assert 1 <= pad_size <= 16
+        return decrypted[:-pad_size]
+
+
+class RC4CryptFilter(object):
+    """Crypt filter corresponding to /V2."""
+    def __init__(self, key):
+        self._key = key
+
+    def decrypt_data(self, num, gen, data):
+        """Decrypt data (string/stream) using key (Algorithm 1)."""
+        new_key_size = min(len(self._key) + 5, 16)
+        key_extension = struct.pack('<i', num)[:3]
+        key_extension += struct.pack('<i', gen)[:2]
+        temp_key = self._key + key_extension
+        temp_key = hashlib.md5(temp_key).digest()
+
+        new_key = temp_key[:new_key_size]
+        cipher = ARC4.new(new_key)
+        return cipher.decrypt(data)
+
+
+class IdentityCryptFilter(object):
+    """Identity crypt filter (no decryption)."""
+    def decrypt_data(self, num, gen, data):
+        return data
+
+
+def decrypt_objects(objects, default_filter, filters):
+    """Decrypt a list of stream objects in place, using default_filter
+    unless the object names its own crypt filter in filters."""
+    for obj in streamobjects(objects):
+        if getattr(obj, 'decrypted', False):
+            continue
+
+        filter = default_filter
+
+        # Check whether a stream-specific crypt filter applies
+        ftype = obj.Filter
+        if ftype is not None:
+            if not isinstance(ftype, PdfArray):
+                ftype = [ftype]
+            if len(ftype) >= 1 and ftype[0] == PdfName.Crypt:
+ ftype = ftype[1:]
+ parms = obj.DecodeParms or obj.DP
+ filter = filters[parms.Name]
+
+ num, gen = obj.indirect
+ obj.stream = filter.decrypt_data(num, gen, obj.stream)
+ obj.private.decrypted = True
+ obj.Filter = ftype or None
diff --git a/pdfrw/errors.py b/pdfrw/errors.py
index 263cd4d..ef6ab7d 100644
--- a/pdfrw/errors.py
+++ b/pdfrw/errors.py
@@ -9,11 +9,14 @@ PDF Exceptions and error handling
import logging
-logging.basicConfig(
- format='[%(levelname)s] %(filename)s:%(lineno)d %(message)s',
- level=logging.WARNING)
+fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d %(message)s')
+
+handler = logging.StreamHandler()
+handler.setFormatter(fmt)
log = logging.getLogger('pdfrw')
+log.setLevel(logging.WARNING)
+log.addHandler(handler)
class PdfError(Exception):
diff --git a/pdfrw/findobjs.py b/pdfrw/findobjs.py
index f19ebdf..67d33a0 100644
--- a/pdfrw/findobjs.py
+++ b/pdfrw/findobjs.py
@@ -8,7 +8,6 @@
'''
from .objects import PdfDict, PdfArray, PdfName
-from .pdfwriter import user_fmt
def find_objects(source, valid_types=(PdfName.XObject, None),
@@ -81,7 +80,7 @@ def wrap_object(obj, width, margin):
iw, ih = float(obj.Width), float(obj.Height)
ch = 1.0 * cw / iw * ih
height = ch + margin[1] + margin[3]
- p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset))
+ p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset))
contents.stream = fmt % p
resources = PdfDict(XObject=PdfDict(MyImage=obj))
mbox = PdfArray((0, 0, width, height))
diff --git a/pdfrw/objects/__init__.py b/pdfrw/objects/__init__.py
index 1746dfe..879e0ef 100644
--- a/pdfrw/objects/__init__.py
+++ b/pdfrw/objects/__init__.py
@@ -15,5 +15,5 @@ from .pdfobject import PdfObject
from .pdfstring import PdfString
from .pdfindirect import PdfIndirect
-__all__ = [PdfName, PdfDict, IndirectPdfDict, PdfArray,
- PdfObject, PdfString, PdfIndirect]
+__all__ = """PdfName PdfDict IndirectPdfDict PdfArray
+ PdfObject PdfString PdfIndirect""".split()
diff --git a/pdfrw/objects/pdfarray.py b/pdfrw/objects/pdfarray.py
index b662755..e15f4ad 100644
--- a/pdfrw/objects/pdfarray.py
+++ b/pdfrw/objects/pdfarray.py
@@ -65,3 +65,7 @@ class PdfArray(list):
def pop(self, *args):
self._resolve()
return list.pop(self, *args)
+
+ def __reversed__(self):
+ self._resolve()
+ return list.__reversed__(self)
diff --git a/pdfrw/objects/pdfdict.py b/pdfrw/objects/pdfdict.py
index fc28492..0fdf75b 100644
--- a/pdfrw/objects/pdfdict.py
+++ b/pdfrw/objects/pdfdict.py
@@ -136,7 +136,15 @@ class PdfDict(dict):
'''
value = dictget(self, key)
if isinstance(value, PdfIndirect):
- self[key] = value = value.real_value()
+ # We used to use self[key] here, but that does an
+ # unwanted check on the type of the key (github issue #98).
+ # Python will keep the old key object in the dictionary,
+ # so that check is not necessary.
+ value = value.real_value()
+ if value is not None:
+ dict.__setitem__(self, key, value)
+ else:
+ del self[key]
return value
def __getitem__(self, key):
diff --git a/pdfrw/objects/pdfname.py b/pdfrw/objects/pdfname.py
index 1fdf5b5..28a1464 100644
--- a/pdfrw/objects/pdfname.py
+++ b/pdfrw/objects/pdfname.py
@@ -23,6 +23,7 @@ class BasePdfName(str):
'''
indirect = False
+ encoded = None
whitespace = '\x00 \t\f\r\n'
delimiters = '()<>{}[]/%'
diff --git a/pdfrw/objects/pdfstring.py b/pdfrw/objects/pdfstring.py
index 5c35d70..906f30e 100644
--- a/pdfrw/objects/pdfstring.py
+++ b/pdfrw/objects/pdfstring.py
@@ -1,74 +1,553 @@
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
-# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+# 2016 James Laird-Wah, Sydney, Australia
# MIT license -- See LICENSE.txt for details
-import re
+"""
+================================
+PdfString encoding and decoding
+================================
+
+Introduction
+=============
+
+
+This module handles encoding and decoding of PDF strings. PDF strings
+are described in the PDF 1.7 reference manual, mostly in chapter 3
+(sections 3.2 and 3.8) and chapter 5.
+
+PDF strings are used in the document structure itself, and also inside
+the stream of page contents dictionaries.
+
+A PDF string can represent pure binary data (e.g. for a font or an
+image), or text, or glyph indices. For Western fonts, the glyph indices
+usually correspond to ASCII, but that is not guaranteed. (When it does
+happen, it makes examination of raw PDF data a lot easier.)
+
+The specification defines PDF string encoding at two different levels.
+At the bottom, it defines ways to encode arbitrary bytes so that a PDF
+tokenizer can understand they are a string of some sort, and can figure
+out where the string begins and ends. (That is all the tokenizer itself
+cares about.) Above that level, if the string represents text, the
+specification defines ways to encode Unicode text into raw bytes, before
+the byte encoding is performed.
+
+There are two ways to do the byte encoding, and two ways to do the text
+(Unicode) encoding.
+
+Encoding bytes into PDF strings
+================================
+
+Adobe calls the two ways to encode bytes into strings "Literal strings"
+and "Hexadecimal strings."
+
+Literal strings
+------------------
+
+A literal string is delimited by ASCII parentheses ("(" and ")"), and a
+hexadecimal string is delimited by ASCII less-than and greater-than
+signs ("<" and ">").
+
+A literal string may encode bytes almost unmolested. The caveat is
+that if a byte has the same value as a parenthesis, it must be escaped
+so that the tokenizer knows the string is not finished. This is accomplished
+by using the ASCII backslash ("\") as an escape character. Of course,
+now any backslash appearing in the data must likewise be escaped.
+
+Hexadecimal strings
+---------------------
+
+A hexadecimal string requires twice as much space as the source data
+it represents (plus two bytes for the delimiter), simply storing each
+byte as two hexadecimal digits, most significant digit first. The spec
+allows for lower or upper case hex digits, but most PDF encoders seem
+to use upper case.
+
+Special cases -- Legacy systems and readability
+-----------------------------------------------
+
+It is possible to create a PDF document that uses 7 bit ASCII encoding,
+and it is desirable in many cases to create PDFs that are reasonably
+readable when opened in a text editor. For these reasons, the syntax
+for both literal strings and hexadecimal strings is slightly more
+complicated than the initial description above. In general, the additional
+syntax allows the following features:
+
+ - Making the delineation between characters, or between sections of
+ a string, apparent, and easy to see in an editor.
+ - Keeping output lines from getting too wide for some editors
+ - Keeping output lines from being so narrow that you can see only a
+ small fraction of a string at a time in an editor.
+ - Suppressing unprintable characters
+ - Restricting the output string to 7 bit ASCII
+
+Hexadecimal readability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For hexadecimal strings, only the first two bullets are relevant. The syntax
+to accomplish this is simple, allowing any ASCII whitespace to be inserted
+anywhere in the encoded hex string.
+
+Literal readability
+~~~~~~~~~~~~~~~~~~~
+
+For literal strings, all of the bullets except the first are relevant.
+The syntax has two methods to help with these goals. The first method
+is to overload the escape operator to be able to do different functions,
+and the second method can reduce the number of escapes required for
+parentheses in the normal case.
+
+The escape function works differently, depending on what byte follows
+the backslash. In all cases, the escaping backslash is discarded,
+and then the next character is examined:
+
+ - For parentheses and backslashes (and, in fact, for all characters
+ not described otherwise in this list), the character after the
+ backslash is preserved in the output.
+ - A letter from the set of "nrtbf" following a backslash is interpreted as
+ a line feed, carriage return, tab, backspace, or form-feed, respectively.
+ - One to three octal digits following the backslash indicate the
+ numeric value of the encoded byte.
+ - A carriage return, carriage return/line feed, or line feed following
+ the backslash indicates a line break that was put in for readability,
+ and that is not part of the actual data, so this is discarded.
+
+The second method that can be used to improve readability (and reduce space)
+in literal strings is to not escape parentheses. This only works, and is
+only allowed, when the parentheses are properly balanced. For example,
+"((Hello))" is a valid encoding for a literal string, but "((Hello)" is not;
+the latter case should be encoded "(\(Hello)"
+
+Encoding text into strings
+==========================
+
+Section 3.8.1 of the PDF specification describes text strings.
+
+The individual characters of a text string can all be considered to
+be Unicode; Adobe specifies two different ways to encode these characters
+into a string of bytes before further encoding the byte string as a
+literal string or a hexadecimal string.
+
+The first way to encode these strings is called PDFDocEncoding. This
+is mostly a one-for-one mapping of bytes into single bytes, similar to
+Latin-1. The representable character set is limited to the number of
+characters that can fit in a byte, and this encoding cannot be used
+with Unicode strings that start with the two characters making up the
+UTF-16-BE BOM.
+
+The second way to encode these strings is with UTF-16-BE. Text strings
+encoded with this method must start with the BOM, and although the spec
+does not appear to mandate that the resultant bytes be encoded into a
+hexadecimal string, that seems to be the canonical way to do it.
+
+When encoding a string into UTF-16-BE, this module always adds the BOM,
+and when decoding a string from UTF-16-BE, this module always strips
+the BOM. If a source string contains a BOM, that will remain in the
+final string after a round-trip through the encoder and decoder, as
+the goal of the encoding/decoding process is transparency.
+
+
+PDF string handling in pdfrw
+=============================
+
+Responsibility for handling PDF strings in the pdfrw library is shared
+between this module, the tokenizer, and the pdfwriter.
+
+tokenizer string handling
+--------------------------
+
+As far as the tokenizer and its clients such as the pdfreader are concerned,
+the PdfString class must simply be something that it can instantiate by
+passing a string, that doesn't compare equal (or throw an exception when
+compared) to other possible token strings. The tokenizer must understand
+enough about the syntax of the string to successfully find its beginning
+and end in a stream of tokens, but doesn't otherwise know or care about
+the data represented by the string.
+
+pdfwriter string handling
+--------------------------
+
+The pdfwriter knows and cares about two attributes of PdfString instances:
+
+ - First, PdfString objects have an 'indirect' attribute, which pdfwriter
+ uses as an indication that the object knows how to represent itself
+ correctly when output to a new PDF. (In the case of a PdfString object,
+ no work is really required, because it is already a string.)
+ - Second, the PdfString.encode() method is used as a convenience to
+ automatically convert any user-supplied strings (that didn't come
+ from PDFs) when a PDF is written out to a file.
+
+pdfstring handling
+-------------------
+
+The code in this module is designed to support those uses by the
+tokenizer and the pdfwriter, and to additionally support encoding
+and decoding of PdfString objects as a convenience for the user.
+
+Most users of the pdfrw library never encode or decode a PdfString,
+so it is imperative that (a) merely importing this module does not
+take a significant amount of CPU time; and (b) it is cheap for the
+tokenizer to produce a PdfString, and cheap for the pdfwriter to
+consume a PdfString -- if the tokenizer finds a string that conforms
+to the PDF specification, it will be wrapped in a PdfString object,
+and if the pdfwriter finds an object with an indirect attribute, it
+simply calls str() to ask it to format itself.
+
+Encoding and decoding are not actually performed very often at all,
+compared to how often tokenization and then subsequent concatenation
+by the pdfwriter are performed. In fact, versions of pdfrw prior to
+0.4 did not even support Unicode for this function. Encoding and
+decoding can also easily be performed by the user, outside of the
+library, and this might still be recommended, at least for encoding,
+if the visual appeal of encodings generated by this module is found
+lacking.
+
+
+Decoding strings
+~~~~~~~~~~~~~~~~~~~
+
+Decoding strings can be tricky, but is a bounded process. Each
+properly-encoded string represents exactly one output string,
+with the caveat that it is up to the caller of the function to know whether
+he expects a Unicode string, or just bytes.
+
+The caller can call PdfString.to_bytes() to get a byte string (which may
+or may not represent encoded Unicode), or may call PdfString.to_unicode()
+to get a Unicode string. Byte strings will be regular strings in Python 2,
+and b'' bytes in Python 3; Unicode strings will be regular strings in
+Python 3, and u'' unicode strings in Python 2.
+
+To maintain application compatibility with earlier versions of pdfrw,
+PdfString.decode() is an alias for PdfString.to_unicode().
+
+Encoding strings
+~~~~~~~~~~~~~~~~~~
+
+PdfString has three factory functions that will encode strings into
+PdfString objects:
+
+ - PdfString.from_bytes() accepts a byte string (regular string in Python 2
+ or b'' bytes string in Python 3) and returns a PdfString object.
+ - PdfString.from_unicode() accepts a Unicode string (u'' Unicode string in
+ Python 2 or regular string in Python 3) and returns a PdfString object.
+ - PdfString.encode() examines the type of object passed, and either
+ calls from_bytes() or from_unicode() to do the real work.
+
+Unlike decoding(), encoding is not (mathematically) a function.
+There are (literally) an infinite number of ways to encode any given
+source string. (Of course, most of them would be stupid, unless
+the intent is some sort of denial-of-service attack.)
+
+So encoding strings is either simpler than decoding, or can be made to
+be an open-ended science fair project (to create the best looking
+encoded strings).
+
+There are parameters to the encoding functions that allow control over
+the final encoded string, but the intention is to make the default values
+produce a reasonable encoding.
+
+As mentioned previously, if encoding does not do what a particular
+user needs, that user is free to write his own encoder, and then
+simply instantiate a PdfString object by passing a string to the
+default constructor, the same way that the tokenizer does it.
+
+However, if desirable, encoding may gradually become more capable
+over time, adding the ability to generate more aesthetically pleasing
+encoded strings.
+
+PDFDocString encoding and decoding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To handle this encoding in a fairly standard way, this module registers
+an encoder and decoder for PDFDocEncoding with the codecs module.
+
+"""
+
+import re
+import codecs
+import binascii
+import itertools
+from ..py23_diffs import convert_load, convert_store
+
+def find_pdfdocencoding(encoding):
+ """ This function conforms to the codec module registration
+ protocol. It defers calculating data structures until
+ a pdfdocencoding encode or decode is required.
+
+ PDFDocEncoding is described in the PDF 1.7 reference manual.
+ """
+
+ if encoding != 'pdfdocencoding':
+ return
+
+ # Create the decoding map based on the table in section D.2 of the
+ # PDF 1.7 manual
+
+ # Start off with the characters with 1:1 correspondence
+ decoding_map = set(range(0x20, 0x7F)) | set(range(0xA1, 0x100))
+ decoding_map.update((0x09, 0x0A, 0x0D))
+ decoding_map.remove(0xAD)
+ decoding_map = dict((x, x) for x in decoding_map)
+
+ # Add in the special Unicode characters
+ decoding_map.update(zip(range(0x18, 0x20), (
+ 0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC)))
+ decoding_map.update(zip(range(0x80, 0x9F), (
+ 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
+ 0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018,
+ 0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160,
+ 0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E)))
+ decoding_map[0xA0] = 0x20AC
+
+ # Make the encoding map from the decoding map
+ encoding_map = codecs.make_encoding_map(decoding_map)
+
+ # Not every PDF producer follows the spec, so conform to Postel's law
+ # and interpret encoded strings if at all possible. In particular, they
+ # might have nulls and form-feeds, judging by random code snippets
+ # floating around the internet.
+ decoding_map.update(((x, x) for x in range(0x18)))
+
+ def encode(input, errors='strict'):
+ return codecs.charmap_encode(input, errors, encoding_map)
+
+ def decode(input, errors='strict'):
+ return codecs.charmap_decode(input, errors, decoding_map)
+
+ return codecs.CodecInfo(encode, decode, name='pdfdocencoding')
+
+codecs.register(find_pdfdocencoding)
class PdfString(str):
- ''' A PdfString is an encoded string. It has a decode
+ """ A PdfString is an encoded string. It has a decode
method to get the actual string data out, and there
is an encode class method to create such a string.
Like any PDF object, it could be indirect, but it
defaults to being a direct object.
- '''
+ """
indirect = False
- unescape_dict = {'\\b': '\b', '\\f': '\f', '\\n': '\n',
- '\\r': '\r', '\\t': '\t',
- '\\\r\n': '', '\\\r': '', '\\\n': '',
- '\\\\': '\\', '\\': '',
- }
- unescape_pattern = (r'(\\\\|\\b|\\f|\\n|\\r|\\t'
- r'|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)')
- unescape_func = re.compile(unescape_pattern).split
- hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])'
- hex_func = re.compile(hex_pattern).split
- hex_pattern2 = ('([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|'
- '[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])')
- hex_func2 = re.compile(hex_pattern2).split
+ # The byte order mark, and unicode that could be
+ # wrongly encoded into the byte order mark by the
+ # pdfdocencoding codec.
- hex_funcs = hex_func, hex_func2
+ bytes_bom = codecs.BOM_UTF16_BE
+ bad_pdfdoc_prefix = bytes_bom.decode('latin-1')
- def decode_regular(self, remap=chr):
- assert self[0] == '(' and self[-1] == ')'
- mylist = self.unescape_func(self[1:-1])
- result = []
- unescape = self.unescape_dict.get
- for chunk in mylist:
- chunk = unescape(chunk, chunk)
- if chunk.startswith('\\') and len(chunk) > 1:
- value = int(chunk[1:], 8)
- # FIXME: TODO: Handle unicode here
- if value > 127:
- value = 127
- chunk = remap(value)
- if chunk:
- result.append(chunk)
- return ''.join(result)
+ # Used by decode_literal; filled in on first use
- def decode_hex(self, remap=chr, twobytes=False):
- data = ''.join(self.split())
- data = self.hex_funcs[twobytes](data)
- chars = data[1::2]
- other = data[0::2]
- assert (other[0] == '<' and
- other[-1] == '>' and
- ''.join(other) == '<>'), self
- return ''.join([remap(int(x, 16)) for x in chars])
+ unescape_dict = None
+ unescape_func = None
- def decode(self, remap=chr, twobytes=False):
- if self.startswith('('):
- return self.decode_regular(remap)
+ @classmethod
+ def init_unescapes(cls):
+ """ Sets up the unescape attributes for decode_literal
+ """
+ unescape_pattern = r'\\([0-7]{1,3}|\r\n|.)'
+ unescape_func = re.compile(unescape_pattern, re.DOTALL).split
+ cls.unescape_func = unescape_func
+
+ unescape_dict = dict(((chr(x), chr(x)) for x in range(0x100)))
+ unescape_dict.update(zip('nrtbf', '\n\r\t\b\f'))
+ unescape_dict['\r'] = ''
+ unescape_dict['\n'] = ''
+ unescape_dict['\r\n'] = ''
+ for i in range(0o10):
+ unescape_dict['%01o' % i] = chr(i)
+ for i in range(0o100):
+ unescape_dict['%02o' % i] = chr(i)
+ for i in range(0o400):
+ unescape_dict['%03o' % i] = chr(i)
+ cls.unescape_dict = unescape_dict
+ return unescape_func
+
+ def decode_literal(self):
+ """ Decode a PDF literal string, which is enclosed in parentheses ()
+
+ Many pdfrw users never decode strings, so defer creating
+ data structures to do so until the first string is decoded.
+
+ Possible string escapes from the spec:
+ (PDF 1.7 Reference, section 3.2.3, page 53)
+
+ 1. \[nrtbf\()]: simple escapes
+ 2. \\d{1,3}: octal. Must be zero-padded to 3 digits
+ if followed by digit
+ 3. \: line continuation. We don't know the EOL
+ marker used in the PDF, so accept \r, \n, and \r\n.
+ 4. Any other character following \ escape -- the backslash
+ is swallowed.
+ """
+ result = (self.unescape_func or self.init_unescapes())(self[1:-1])
+ if len(result) == 1:
+ return convert_store(result[0])
+ unescape_dict = self.unescape_dict
+ result[1::2] = [unescape_dict[x] for x in result[1::2]]
+ return convert_store(''.join(result))
+
+
+ def decode_hex(self):
+ """ Decode a PDF hexadecimal-encoded string, which is enclosed
+ in angle brackets <>.
+ """
+ hexstr = convert_store(''.join(self[1:-1].split()))
+ if len(hexstr) % 2: # odd number of chars indicates a truncated 0
+ hexstr += '0'
+ return binascii.unhexlify(hexstr)
+
+
+ def to_bytes(self):
+ """ Decode a PDF string to bytes. This is a convenience function
+ for user code, in that (as of pdfrw 0.3) it is never
+ actually used inside pdfrw.
+ """
+ if self.startswith('(') and self.endswith(')'):
+ return self.decode_literal()
+
+ elif self.startswith('<') and self.endswith('>'):
+ return self.decode_hex()
else:
- return self.decode_hex(remap, twobytes)
+ raise ValueError('Invalid PDF string "%s"' % repr(self))
- def encode(cls, source, usehex=False):
- assert not usehex, "Not supported yet"
- source = source.replace('\\', '\\\\')
- source = source.replace('(', '\\(')
- source = source.replace(')', '\\)')
- return cls('(' + source + ')')
- encode = classmethod(encode)
+ def to_unicode(self):
+ """ Decode a PDF string to a unicode string. This is a
+ convenience function for user code, in that (as of
+ pdfrw 0.3) it is never actually used inside pdfrw.
+
+ There are two Unicode storage methods used -- either
+ UTF16_BE, or something called PDFDocEncoding, which
+ is defined in the PDF spec. The determination of
+ which decoding method to use is done by examining the
+ first two bytes for the byte order marker.
+ """
+ raw = self.to_bytes()
+
+ if raw[:2] == self.bytes_bom:
+ return raw[2:].decode('utf-16-be')
+ else:
+ return raw.decode('pdfdocencoding')
+
+ # Legacy-compatible interface
+ decode = to_unicode
+
+ # Internal value used by encoding
+
+ escape_splitter = None # Calculated on first use
+
+ @classmethod
+ def init_escapes(cls):
+ """ Initialize the escape_splitter for the encode method
+ """
+ cls.escape_splitter = re.compile(br'(\(|\\|\))').split
+ return cls.escape_splitter
+
+ @classmethod
+ def from_bytes(cls, raw, bytes_encoding='auto'):
+ """ The from_bytes() constructor is called to encode a source raw
+ byte string into a PdfString that is suitable for inclusion
+ in a PDF.
+
+ NOTE: There is no magic in the encoding process. A user
+ can certainly do his own encoding, and simply initialize a
+ PdfString() instance with his encoded string. That may be
+ useful, for example, to add line breaks to make it easier
+ to load PDFs into editors, or to not bother to escape balanced
+ parentheses, or to escape additional characters to make a PDF
+ more readable in a file editor. Those are features not
+ currently supported by this method.
+
+ from_bytes() can use a heuristic to figure out the best
+ encoding for the string, or the user can control the process
+ by changing the bytes_encoding parameter to 'literal' or 'hex'
+ to force a particular conversion method.
+ """
+
+ # If hexadecimal is not being forced, then figure out how long
+ # the escaped literal string will be, and fall back to hex if
+ # it is too long.
+
+ force_hex = bytes_encoding == 'hex'
+ if not force_hex:
+ if bytes_encoding not in ('literal', 'auto'):
+ raise ValueError('Invalid bytes_encoding value: %s'
+ % bytes_encoding)
+ splitlist = (cls.escape_splitter or cls.init_escapes())(raw)
+ if bytes_encoding == 'auto' and len(splitlist) // 2 >= len(raw):
+ force_hex = True
+
+ if force_hex:
+ # The spec does not mandate uppercase,
+ # but it seems to be the convention.
+ fmt = '<%s>'
+ result = binascii.hexlify(raw).upper()
+ else:
+ fmt = '(%s)'
+ splitlist[1::2] = [(b'\\' + x) for x in splitlist[1::2]]
+ result = b''.join(splitlist)
+
+ return cls(fmt % convert_load(result))
+
+ @classmethod
+ def from_unicode(cls, source, text_encoding='auto',
+ bytes_encoding='auto'):
+ """ The from_unicode() constructor is called to encode a source
+ string into a PdfString that is suitable for inclusion in a PDF.
+
+ NOTE: There is no magic in the encoding process. A user
+ can certainly do his own encoding, and simply initialize a
+ PdfString() instance with his encoded string. That may be
+ useful, for example, to add line breaks to make it easier
+ to load PDFs into editors, or to not bother to escape balanced
+ parentheses, or to escape additional characters to make a PDF
+ more readable in a file editor. Those are features not
+ supported by this method.
+
+ from_unicode() can use a heuristic to figure out the best
+ encoding for the string, or the user can control the process
+ by changing the text_encoding parameter to 'pdfdocencoding'
+ or 'utf16', and/or by changing the bytes_encoding parameter
+ to 'literal' or 'hex' to force particular conversion methods.
+
+ The function will raise an exception if it cannot perform
+ the conversion as requested by the user.
+ """
+
+ # Give preference to pdfdocencoding, since it only
+ # requires one raw byte per character, rather than two.
+ if text_encoding != 'utf16':
+ force_pdfdoc = text_encoding == 'pdfdocencoding'
+ if text_encoding != 'auto' and not force_pdfdoc:
+ raise ValueError('Invalid text_encoding value: %s'
+ % text_encoding)
+
+ if source.startswith(cls.bad_pdfdoc_prefix):
+ if force_pdfdoc:
+ raise UnicodeError('Prefix of string %r cannot be encoded '
+ 'in pdfdocencoding' % source[:20])
+ else:
+ try:
+ raw = source.encode('pdfdocencoding')
+ except UnicodeError:
+ if force_pdfdoc:
+ raise
+ else:
+ return cls.from_bytes(raw, bytes_encoding)
+
+ # If the user is not forcing literal strings,
+ # it makes much more sense to use hexadecimal with 2-byte chars
+ raw = cls.bytes_bom + source.encode('utf-16-be')
+ encoding = 'hex' if bytes_encoding == 'auto' else bytes_encoding
+ return cls.from_bytes(raw, encoding)
+
+ @classmethod
+ def encode(cls, source, uni_type = type(u''), isinstance=isinstance):
+ """ The encode() constructor is a legacy function that is
+ also a convenience for the PdfWriter.
+ """
+ if isinstance(source, uni_type):
+ return cls.from_unicode(source)
+ else:
+ return cls.from_bytes(source)
diff --git a/pdfrw/pagemerge.py b/pdfrw/pagemerge.py
index de1d69b..4555110 100644
--- a/pdfrw/pagemerge.py
+++ b/pdfrw/pagemerge.py
@@ -176,8 +176,8 @@ class PageMerge(list):
return self
def render(self):
- def do_xobjs(xobj_list):
- content = []
+ def do_xobjs(xobj_list, restore_first=False):
+ content = ['Q'] if restore_first else []
for obj in xobj_list:
index = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
if xobjs.setdefault(index, obj) is not obj:
@@ -199,9 +199,9 @@ class PageMerge(list):
allkeys = xobjs.keys()
if allkeys:
keys = (x for x in allkeys if x.startswith('/pdfrw_'))
- keys = (x for x in keys if x[6:].isdigit())
- keys = sorted(keys, key=lambda x: int(x[6:]))
- key_offset = (int(keys[-1][6:]) + 1) if keys else 0
+ keys = (x for x in keys if x[7:].isdigit())
+ keys = sorted(keys, key=lambda x: int(x[7:]))
+ key_offset = (int(keys[-1][7:]) + 1) if keys else 0
key_offset -= len(allkeys)
if old_contents is None:
@@ -213,10 +213,18 @@ class PageMerge(list):
index = self.index(None)
if index:
new_contents.append(do_xobjs(self[:index]))
- new_contents.extend(old_contents)
+
index += 1
if index < len(self):
- new_contents.append(do_xobjs(self[index:]))
+ # There are elements to add after the original page contents,
+ # so push the graphics state to the stack. Restored below.
+ new_contents.append(PdfDict(indirect=True, stream='q'))
+
+ new_contents.extend(old_contents)
+
+ if index < len(self):
+ # Restore graphics state and add other elements.
+ new_contents.append(do_xobjs(self[index:], restore_first=True))
if mbox is None:
cbox = None
diff --git a/pdfrw/pdfreader.py b/pdfrw/pdfreader.py
index 0baf0eb..c2ae030 100644
--- a/pdfrw/pdfreader.py
+++ b/pdfrw/pdfreader.py
@@ -19,7 +19,8 @@ from .errors import PdfParseError, log
from .tokens import PdfTokens
from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
from .uncompress import uncompress
-from .py23_diffs import convert_load, iteritems
+from . import crypt
+from .py23_diffs import convert_load, convert_store, iteritems
class PdfReader(PdfDict):
@@ -265,8 +266,17 @@ class PdfReader(PdfDict):
for key in new:
self.loadindirect(key)
+ def decrypt_all(self):
+ self.read_all()
+
+ if self.crypt_filters is not None:
+ crypt.decrypt_objects(
+ self.indirect_objects.values(), self.stream_crypt_filter,
+ self.crypt_filters)
+
def uncompress(self):
self.read_all()
+
uncompress(self.indirect_objects.values())
def load_stream_objects(self, object_streams):
@@ -279,22 +289,26 @@ class PdfReader(PdfDict):
# read objects from stream
if objs:
+ # Decrypt
+ if self.crypt_filters is not None:
+ crypt.decrypt_objects(
+ objs, self.stream_crypt_filter, self.crypt_filters)
+
+ # Decompress
uncompress(objs)
+
for obj in objs:
objsource = PdfTokens(obj.stream, 0, False)
- snext = objsource.next
- offsets = {}
+ next = objsource.next
+ offsets = []
firstoffset = int(obj.First)
- num = snext()
- while num.isdigit():
- offset = int(snext())
- offsets[int(num)] = firstoffset + offset
- num = snext()
- for num, offset in iteritems(offsets):
+ while objsource.floc < firstoffset:
+ offsets.append((int(next()), firstoffset + int(next())))
+ for num, offset in offsets:
# Read the object, and call special code if it starts
# an array or dictionary
objsource.floc = offset
- sobj = snext()
+ sobj = next()
func = self.special.get(sobj)
if func is not None:
sobj = func(objsource)
@@ -332,7 +346,6 @@ class PdfReader(PdfDict):
'''
def readint(s, lengths):
- lengths = itertools.cycle(lengths)
offset = 0
for length in itertools.cycle(lengths):
next = offset + length
@@ -354,8 +367,13 @@ class PdfReader(PdfDict):
source.exception('Expected dict type of /XRef')
tok = next()
self.readstream(obj, self.findstream(obj, tok, source), source, True)
+ old_strm = obj.stream
if not uncompress([obj], True):
source.exception('Could not decompress Xref stream')
+ stream = obj.stream
+ # Fix for issue #76 -- goofy compressed xref stream
+ # that is NOT ACTUALLY COMPRESSED
+ stream = stream if stream is not old_strm else convert_store(old_strm)
num_pairs = obj.Index or PdfArray(['0', obj.Size])
num_pairs = [int(x) for x in num_pairs]
num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
@@ -363,7 +381,7 @@ class PdfReader(PdfDict):
if len(entry_sizes) != 3:
source.exception('Invalid entry size')
object_streams = defaultdict(list)
- get = readint(obj.stream, entry_sizes)
+ get = readint(stream, entry_sizes)
for objnum, size in num_pairs:
for cnt in range(size):
xtype, p1, p2 = islice(get, 3)
@@ -431,7 +449,10 @@ class PdfReader(PdfDict):
''' Parse (one of) the cross-reference file section(s)
'''
next = source.next
- tok = next()
+ try:
+ tok = next()
+ except StopIteration:
+ tok = ''
if tok.isdigit():
return self.parse_xref_stream(source), True
elif tok == 'xref':
@@ -450,36 +471,92 @@ class PdfReader(PdfDict):
typename = PdfName.Type
kidname = PdfName.Kids
- # PDFs can have arbitrarily nested Pages/Page
- # dictionary structures.
- def readnode(node):
- nodetype = node[typename]
- if nodetype == pagename:
- yield node
- elif nodetype == pagesname:
- for node in node[kidname]:
- for node in readnode(node):
- yield node
- elif nodetype == catalogname:
- for node in readnode(node[pagesname]):
- yield node
- else:
- log.error('Expected /Page or /Pages dictionary, got %s' %
- repr(node))
try:
- return list(readnode(node))
+ result = []
+ stack = [node]
+ append = result.append
+ pop = stack.pop
+ while stack:
+ node = pop()
+ nodetype = node[typename]
+ if nodetype == pagename:
+ append(node)
+ elif nodetype == pagesname:
+ stack.extend(reversed(node[kidname]))
+ elif nodetype == catalogname:
+ stack.append(node[pagesname])
+ else:
+ log.error('Expected /Page or /Pages dictionary, got %s' %
+ repr(node))
+ return result
except (AttributeError, TypeError) as s:
log.error('Invalid page tree: %s' % s)
return []
- def __init__(self, fname=None, fdata=None, decompress=False,
- disable_gc=True, verbose=True):
+ def _parse_encrypt_info(self, source, password, trailer):
+ """Check password and initialize crypt filters."""
+ # Create and check password key
+ key = crypt.create_key(password, trailer)
+ if not crypt.check_user_password(key, trailer):
+ source.warning('User password does not validate')
+
+ # Create default crypt filters
+ private = self.private
+ crypt_filters = self.crypt_filters
+ version = int(trailer.Encrypt.V or 0)
+ if version in (1, 2):
+ crypt_filter = crypt.RC4CryptFilter(key)
+ private.stream_crypt_filter = crypt_filter
+ private.string_crypt_filter = crypt_filter
+ elif version == 4:
+ if PdfName.CF in trailer.Encrypt:
+ for name, params in iteritems(trailer.Encrypt.CF):
+ if name == PdfName.Identity:
+ continue
+
+ cfm = params.CFM
+ if cfm == PdfName.AESV2:
+ crypt_filters[name] = crypt.AESCryptFilter(key)
+ elif cfm == PdfName.V2:
+ crypt_filters[name] = crypt.RC4CryptFilter(key)
+ else:
+ source.warning(
+ 'Unsupported crypt filter: {}, {}'.format(
+ name, cfm))
+
+ # Read default stream filter
+ if PdfName.StmF in trailer.Encrypt:
+ name = trailer.Encrypt.StmF
+ if name in crypt_filters:
+ private.stream_crypt_filter = crypt_filters[name]
+ else:
+ source.warning(
+ 'Invalid crypt filter name in /StmF:'
+ ' {}'.format(name))
+
+ # Read default string filter
+ if PdfName.StrF in trailer.Encrypt:
+ name = trailer.Encrypt.StrF
+ if name in crypt_filters:
+ private.string_crypt_filter = crypt_filters[name]
+ else:
+ source.warning(
+ 'Invalid crypt filter name in /StrF:'
+ ' {}'.format(name))
+ else:
+ source.warning(
+ 'Unsupported Encrypt version: {}'.format(version))
+
+ def __init__(self, fname=None, fdata=None, decompress=False,
+ decrypt=False, password='', disable_gc=True, verbose=True):
self.private.verbose = verbose
+
# Runs a lot faster with GC off.
disable_gc = disable_gc and gc.isenabled()
if disable_gc:
gc.disable()
+
try:
if fname is not None:
assert fdata is None
@@ -494,8 +571,10 @@ class PdfReader(PdfDict):
except IOError:
raise PdfParseError('Could not read PDF file %s' %
fname)
- fdata = convert_load(fdata)
+
assert fdata is not None
+ fdata = convert_load(fdata)
+
if not fdata.startswith('%PDF-'):
startloc = fdata.find('%PDF-')
if startloc >= 0:
@@ -548,6 +627,23 @@ class PdfReader(PdfDict):
xref_list.append((source.obj_offsets, trailer, is_stream))
source.floc = int(prev)
+ # Handle document encryption
+ private.crypt_filters = None
+ if decrypt and PdfName.Encrypt in trailer:
+ identity_filter = crypt.IdentityCryptFilter()
+ crypt_filters = {
+ PdfName.Identity: identity_filter
+ }
+ private.crypt_filters = crypt_filters
+ private.stream_crypt_filter = identity_filter
+ private.string_crypt_filter = identity_filter
+
+ if not crypt.HAS_CRYPTO:
+ raise PdfParseError(
+ 'Install PyCrypto to enable encryption support')
+
+ self._parse_encrypt_info(source, password, trailer)
+
if is_stream:
self.load_stream_objects(trailer.object_streams)
@@ -566,6 +662,10 @@ class PdfReader(PdfDict):
float(trailer.Version) > float(self.version)):
self.private.version = trailer.Version
+ if decrypt:
+ self.decrypt_all()
+ trailer.Encrypt = None
+
if is_stream:
self.Root = trailer.Root
self.Info = trailer.Info
diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py
index 644bb30..3c887ba 100755
--- a/pdfrw/pdfwriter.py
+++ b/pdfrw/pdfwriter.py
@@ -29,7 +29,7 @@ NullObject.Type = 'Null object'
def user_fmt(obj, isinstance=isinstance, float=float, str=str,
- basestring=str, encode=PdfString.encode):
+ basestring=(type(u''), type(b'')), encode=PdfString.encode):
''' This function may be replaced by the user for
specialized formatting requirements.
'''
@@ -137,11 +137,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
elif isinstance(obj, PdfDict):
if compress and obj.stream:
do_compress([obj])
- pairs = sorted((x, y, getattr(x, 'encoded', x))
+ pairs = sorted((getattr(x, 'encoded', None) or x, y)
for (x, y) in obj.iteritems())
myarray = []
- for key, value, encoding in pairs:
- myarray.append(encoding)
+ for key, value in pairs:
+ myarray.append(key)
myarray.append(add(value))
result = format_array(myarray, '<<%s>>')
stream = obj.stream
@@ -155,7 +155,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
# We assume that an object with an indirect
# attribute knows how to represent itself to us.
if hasattr(obj, 'indirect'):
- return str(getattr(obj, 'encoded', obj))
+ return str(getattr(obj, 'encoded', None) or obj)
return user_fmt(obj)
def format_deferred():
@@ -177,10 +177,10 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
# Don't reference old catalog or pages objects --
# swap references to new ones.
- swapobj = {PdfName.Catalog: trailer.Root,
+ type_remap = {PdfName.Catalog: trailer.Root,
PdfName.Pages: trailer.Root.Pages, None: trailer}.get
- swapobj = [(objid, swapobj(obj.Type))
- for objid, obj in iteritems(killobj)]
+ swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
+ for objid, (obj, new_obj) in iteritems(killobj)]
swapobj = dict((objid, obj is None and NullObject or obj)
for objid, obj in swapobj).get
@@ -225,11 +225,44 @@ class PdfWriter(object):
_trailer = None
canonicalize = False
+ fname = None
- def __init__(self, version='1.3', compress=False):
- self.pagearray = PdfArray()
- self.compress = compress
+ def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
+ """
+ Parameters:
+ fname -- Output file name, or file-like binary object
+ with a write method
+ version -- PDF version to target. Currently only 1.3
+ supported.
+ compress -- True to do compression on output. Currently
+ compresses stream objects.
+ """
+
+ # Legacy support: fname is new, was added in front
+ if fname is not None:
+ try:
+ float(fname)
+ except (ValueError, TypeError):
+ pass
+ else:
+ if version != '1.3':
+ assert compress == False
+ compress = version
+ version = fname
+ fname = None
+
+ self.fname = fname
self.version = version
+ self.compress = compress
+
+ if kwargs:
+ for name, value in iteritems(kwargs):
+ if name not in self.replaceable:
+ raise ValueError("Cannot set attribute %s "
+ "on PdfWriter instance" % name)
+ setattr(self, name, value)
+
+ self.pagearray = PdfArray()
self.killobj = {}
def addpage(self, page):
@@ -251,13 +284,14 @@ class PdfWriter(object):
# Add parents in the hierarchy to objects we
# don't want to output
killobj = self.killobj
- obj = page.Parent
+ obj, new_obj = page, self.pagearray[-1]
while obj is not None:
objid = id(obj)
if objid in killobj:
break
- killobj[objid] = obj
+ killobj[objid] = obj, new_obj
obj = obj.Parent
+ new_obj = None
return self
addPage = addpage # for compatibility with pyPdf
@@ -300,10 +334,18 @@ class PdfWriter(object):
trailer = property(_get_trailer, _set_trailer)
- def write(self, fname, trailer=None, user_fmt=user_fmt,
+ def write(self, fname=None, trailer=None, user_fmt=user_fmt,
disable_gc=True):
+
trailer = trailer or self.trailer
+ # Support fname for legacy applications
+ if (fname is not None) == (self.fname is not None):
+ raise PdfOutputError(
+ "PdfWriter fname must be specified exactly once")
+
+ fname = fname or self.fname
+
# Dump the data. We either have a filename or a preexisting
# file object.
preexisting = hasattr(fname, 'write')
@@ -339,3 +381,5 @@ class PdfWriter(object):
workitems += obj
else:
workitems += obj.values()
+
+ replaceable = set(vars())
\ No newline at end of file
diff --git a/pdfrw/py23_diffs.py b/pdfrw/py23_diffs.py
index a0e0318..b3509d0 100644
--- a/pdfrw/py23_diffs.py
+++ b/pdfrw/py23_diffs.py
@@ -14,7 +14,9 @@ try:
except NameError:
def convert_load(s):
- return s.decode('Latin-1')
+ if isinstance(s, bytes):
+ return s.decode('Latin-1')
+ return s
def convert_store(s):
return s.encode('Latin-1')
@@ -44,3 +46,8 @@ try:
xrange = xrange
except NameError:
xrange = range
+
+try:
+ intern = intern
+except NameError:
+ from sys import intern
diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py
index 5b061d5..2b69e02 100644
--- a/pdfrw/tokens.py
+++ b/pdfrw/tokens.py
@@ -15,7 +15,7 @@ import itertools
from .objects import PdfString, PdfObject
from .objects.pdfname import BasePdfName
from .errors import log, PdfParseError
-from .py23_diffs import nextattr
+from .py23_diffs import nextattr, intern
def linepos(fdata, loc):
@@ -64,19 +64,7 @@ class PdfTokens(object):
findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
whitespace), re.DOTALL).finditer
- def _cacheobj(cache, obj, constructor):
- ''' This caching relies on the constructors
- returning something that will compare as
- equal to the original obj. This works
- fine with our PDF objects.
- '''
- result = cache.get(obj)
- if result is None:
- result = constructor(obj)
- cache[result] = result
- return result
-
- def _gettoks(self, startloc, cacheobj=_cacheobj,
+ def _gettoks(self, startloc, intern=intern,
delimiters=delimiters, findtok=findtok,
findparen=findparen, PdfString=PdfString,
PdfObject=PdfObject, BasePdfName=BasePdfName):
@@ -95,24 +83,23 @@ class PdfTokens(object):
fdata = self.fdata
current = self.current = [(startloc, startloc)]
cache = {}
+ get_cache = cache.get
while 1:
for match in findtok(fdata, current[0][1]):
current[0] = tokspan = match.span()
token = match.group(1)
firstch = token[0]
+ toktype = intern
if firstch not in delimiters:
- token = cacheobj(cache, token, PdfObject)
+ toktype = PdfObject
elif firstch in '/<(%':
if firstch == '/':
# PDF Name
- encoded = token
- token = cache.get(encoded)
- if token is None:
- token = cache[token] = BasePdfName(encoded)
+ toktype = BasePdfName
elif firstch == '<':
# << dict delim, or < hex string >
if token[1:2] != '<':
- token = cacheobj(cache, token, PdfString)
+ toktype = PdfString
elif firstch == '(':
# Literal string
# It's probably simple, but maybe not
@@ -145,7 +132,7 @@ class PdfTokens(object):
loc, ends, nest = ends
token = fdata[m_start:loc] + ')' * nest
current[0] = m_start, ends
- token = cacheobj(cache, token, PdfString)
+ toktype = PdfString
elif firstch == '%':
# Comment
if self.strip_comments:
@@ -154,7 +141,10 @@ class PdfTokens(object):
self.exception(('Tokenizer logic incorrect -- '
'should never get here'))
- yield token
+ newtok = get_cache(token)
+ if newtok is None:
+ newtok = cache[token] = toktype(token)
+ yield newtok
if current[0] is not tokspan:
break
else:
@@ -168,6 +158,7 @@ class PdfTokens(object):
self.iterator = iterator = self._gettoks(startloc)
self.msgs_dumped = None if verbose else set()
self.next = getattr(iterator, nextattr)
+ self.current = [(startloc, startloc)]
def setstart(self, startloc):
''' Change the starting location.
@@ -213,6 +204,8 @@ class PdfTokens(object):
msg %= arg
fdata = self.fdata
begin, end = self.current[0]
+ if begin >= len(fdata):
+ return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
line, col = linepos(fdata, begin)
if end > begin:
tok = fdata[begin:end].rstrip()
diff --git a/pdfrw/toreportlab.py b/pdfrw/toreportlab.py
index 9f77d26..3434fbf 100644
--- a/pdfrw/toreportlab.py
+++ b/pdfrw/toreportlab.py
@@ -108,7 +108,7 @@ def _makearray(rldoc, pdfobj):
def _makestr(rldoc, pdfobj):
assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
# TODO: Add fix for float like in pdfwriter
- return str(getattr(pdfobj, 'encoded', pdfobj))
+ return str(getattr(pdfobj, 'encoded', None) or pdfobj)
def makerl_recurse(rldoc, pdfobj):
diff --git a/pdfrw/uncompress.py b/pdfrw/uncompress.py
index 6780d5d..39e8308 100644
--- a/pdfrw/uncompress.py
+++ b/pdfrw/uncompress.py
@@ -12,7 +12,7 @@ PNG predictor were originally transcribed from PyPDF2, which is
probably an excellent source of additional filters.
'''
import array
-from .objects import PdfDict, PdfName
+from .objects import PdfDict, PdfName, PdfArray
from .errors import log
from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
@@ -37,7 +37,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
if isinstance(ftype, list) and len(ftype) == 1:
# todo: multiple filters
ftype = ftype[0]
- parms = obj.DecodeParms
+ parms = obj.DecodeParms or obj.DP
if ftype != flate:
msg = ('Not decompressing: cannot use filter %s'
' with parameters %s') % (repr(ftype), repr(parms))
@@ -53,10 +53,18 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
error = str(s)
else:
error = None
+ if isinstance(parms, PdfArray):
+ oldparms = parms
+ parms = PdfDict()
+ for x in oldparms:
+ parms.update(x)
if parms:
predictor = int(parms.Predictor or 1)
+ columns = int(parms.Columns or 1)
+ colors = int(parms.Colors or 1)
+ bpc = int(parms.BitsPerComponent or 8)
if 10 <= predictor <= 15:
- data, error = flate_png(data, parms)
+ data, error = flate_png(data, predictor, columns, colors, bpc)
elif predictor != 1:
error = ('Unsupported flatedecode predictor %s' %
repr(predictor))
@@ -74,7 +82,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
return ok
-def flate_png(data, parms):
+def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
''' PNG prediction is used to make certain kinds of data
more compressible. Before the compression, each data
byte is either left the same, or is set to be a delta
@@ -87,9 +95,12 @@ def flate_png(data, parms):
this technique for Xref stream objects, which are
quite regular.
'''
- columns = int(parms.Columns)
+ columnbytes = ((columns * colors * bpc) + 7) // 8
data = array.array('B', data)
- rowlen = columns + 1
+ rowlen = columnbytes + 1
+ if predictor == 15:
+ padding = (rowlen - len(data)) % rowlen
+ data.extend([0] * padding)
assert len(data) % rowlen == 0
rows = xrange(0, len(data), rowlen)
for row_index in rows:
diff --git a/releasing.txt b/releasing.txt
index a108184..b186013 100644
--- a/releasing.txt
+++ b/releasing.txt
@@ -1,6 +1,6 @@
Notes on releasing, which is not yet fully automated:
-1) Update version number both in __init__ and in setup
+1) Update version number in pdfrw/__init__.py
2) Use pyroma
diff --git a/setup.py b/setup.py
index 7d94f95..a18132b 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,8 @@ setup(
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
-
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
'Topic :: Multimedia :: Graphics :: Graphics Conversion',
'Topic :: Software Development :: Libraries',
'Topic :: Text Processing',
@@ -35,4 +36,5 @@ setup(
'Topic :: Utilities',
],
keywords='pdf vector graphics PDF nup watermark split join merge',
+ zip_safe=True,
)
diff --git a/tests/expected.txt b/tests/expected.txt
index 64eecdd..b1b7cca 100644
--- a/tests/expected.txt
+++ b/tests/expected.txt
@@ -11,8 +11,8 @@ examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb
examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b
examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d
-examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 841c980dfadf2cc47ad86e4649ca69b6
-examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 41989bb2cb6225c6e14262ff5d4f151f
+examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331
+examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612
# All these are in the poster test
@@ -20,10 +20,10 @@ examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a
examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86
examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d
-examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab 959d6246ad8bda72bd023e8681216d17
-examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 45b4ae29a038271896b7264bbed63bdf
-examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 822bce1cb9e053f1f3f6b922bf27fab8
-examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 97ad6a8ca3fe7cc4e1f0ffb8475355e9
+examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198
+examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450
+examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98
+examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5
# List things that need work here (typically cause exceptions)
@@ -68,32 +68,33 @@ repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b
repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1
repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8
repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5
-repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 53e5510be27db134edf3cf23873914af
+repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c
repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598
-repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 019aead1450842406a04c508243e5161
-repaginate/22628a7ed578b622520325673ab2a4f2.pdf 255776a6956918c7b324dede711680ae
+repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4
+repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b
repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61
-repaginate/295d26e61a85635433f8e4b768953f60.pdf 13ece51f4d2ad25707982765abbcd789
+repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d
+repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795
repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22
-repaginate/319c998910453bc44d40c7748cd2cb79.pdf c1a19d1acc3f172711bdbea000cf392e
+repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0
repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3
repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0
repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2
-repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf d0b7467d7bd6c7f73b7764b06c0be1aa
-repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 6b0ab50c247ca43b70b2b2f27ee2c1a2
-repaginate/5f0cff36d0ad74536a6513a98a755016.pdf b65c2557988db8625c0761bab1d131f1
-repaginate/5f265db2736850782aeaba2571a3c749.pdf 9bb5644ede0ee7cf99642729eda76686
-repaginate/6a42c8c79b807bf164d31071749e07b0.pdf 33a231263e1a4203338b7b1052fc0091
-repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 93419e831e436d9093a153f35d3441c3
+repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45
+repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315
+repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69
+repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c
+repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f
+repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e
repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261
-repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf 6c65526ab372d72cb185933e3d2584ef
+repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c
repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233
repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea
repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273
repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae
-repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf b0d1f3925423f9c3ecf4a47baa949f75
+repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83
repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd
repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840
repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68
@@ -101,6 +102,7 @@ repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa
repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273
repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d
+
simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef
simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
@@ -111,6 +113,7 @@ simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d
simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097
simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa
+simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2
simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf
simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4
simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef
@@ -124,7 +127,7 @@ simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9
simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf
simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05
-simple/707e3e2d17cbe9ec2273414b3b63f333.pdf 4bdf1e57a96ce42717110b4e55098c1a
+simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70
simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e
simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
@@ -138,3 +141,85 @@ simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124
simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27
+
+decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
+decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
+decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b
+decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36
+decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
+decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90
+decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637
+decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
+decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b
+decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad
+decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9
+decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3
+decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f
+decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e
+decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a
+decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6
+decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
+decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc
+decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6
+decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58
+decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56
+decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149
+decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
+decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0
+decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158
+decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24
+decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
+decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d
+decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
+decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6
+decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 4b53b63b0779b81d8f9569e66ca3d8ee
+decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
+decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227
+decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f
+decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
+decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a
+decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4
+decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380
+decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
+decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0
+
+compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa
+compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
+compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e
+compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3
+compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
+compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976
+compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c
+compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6
+compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4
+compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5
+compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24
+compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c
+compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30
+compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44
+compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403
+compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15
+compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e
+compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba
+compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9
+compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192
+compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20
+compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c
+compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
+compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7
+compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7
+compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e
+compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594
+compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1
+compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81
+compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd
+compress/9f98322c243fe67726d56ccfa8e0885b.pdf 3167fa11a3f1f4a06f90294b21e101b7
+compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
+compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f
+compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868
+compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8
+compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24
+compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229
+compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c
+compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81
+compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578
diff --git a/tests/test_examples.py b/tests/test_examples.py
index baa98a6..6871b80 100755
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -96,7 +96,7 @@ class TestOnePdf(unittest.TestCase):
os.remove(scrub)
subprocess.call(params)
if scrub:
- PdfWriter().addpages(PdfReader(scrub).pages).write(dstf)
+ PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
with open(dstf, 'rb') as f:
data = f.read()
size = len(data)
diff --git a/tests/test_pdfdict.py b/tests/test_pdfdict.py
new file mode 100755
index 0000000..cdbe15d
--- /dev/null
+++ b/tests/test_pdfdict.py
@@ -0,0 +1,39 @@
+#! /usr/bin/env python
+# encoding: utf-8
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+# 2016 James Laird-Wah, Sydney, Australia
+# MIT license -- See LICENSE.txt for details
+
+'''
+Run from the directory above like so:
+python -m tests.test_pdfdict
+'''
+
+
+from pdfrw import PdfDict, PdfName
+from pdfrw.objects import PdfIndirect
+
+import unittest
+
+
+class TestPdfDicts(unittest.TestCase):
+
+ def test_indirect_set_get(self):
+ io = PdfIndirect((1,2,3))
+ io.value = 42
+ d = PdfDict()
+ d.Name = io
+ test, = (x for x in dict.values(d))
+ self.assertEqual(test, io)
+ v = d['/Name']
+ self.assertEqual(v, io.value)
+ test, = d
+ self.assertEqual(type(test), type(PdfName.Name))
+
+def main():
+ unittest.main()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tests/test_pdfreader_init.py b/tests/test_pdfreader_init.py
new file mode 100644
index 0000000..d27d752
--- /dev/null
+++ b/tests/test_pdfreader_init.py
@@ -0,0 +1,28 @@
+#! /usr/bin/env python
+import static_pdfs
+
+from pdfrw import PdfReader
+
+try:
+ import unittest2 as unittest
+except ImportError:
+ import unittest
+
+
+class TestPdfReaderInit(unittest.TestCase):
+
+ def test_fname_binary_filelike(self):
+ with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file:
+ PdfReader(pdf_file)
+
+ def test_fdata_binary(self):
+ with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file:
+ pdf_bytes = pdf_file.read()
+ PdfReader(fdata=pdf_bytes)
+
+
+def main():
+ unittest.main()
+
+if __name__ == '__main__':
+ main()
diff --git a/tests/test_pdfstring.py b/tests/test_pdfstring.py
old mode 100644
new mode 100755
index fce47ef..0ea91ad
--- a/tests/test_pdfstring.py
+++ b/tests/test_pdfstring.py
@@ -1,4 +1,9 @@
#! /usr/bin/env python
+# encoding: utf-8
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+# 2016 James Laird-Wah, Sydney, Australia
+# MIT license -- See LICENSE.txt for details
'''
Run from the directory above like so:
@@ -6,30 +11,106 @@ python -m tests.test_pdfstring
'''
-import pdfrw
+from pdfrw import PdfString
+from pdfrw.py23_diffs import convert_store
+
import unittest
-class TestEncoding(unittest.TestCase):
+class TestBaseEncoding(unittest.TestCase):
- @staticmethod
- def decode(value):
- return pdfrw.objects.PdfString(value).decode()
+ def encode(self, value):
+ x = PdfString.encode(value)
+ if isinstance(value, type(u'')):
+ y = PdfString.from_unicode(value)
+ else:
+ y = PdfString.from_bytes(value)
+ self.assertEqual(x, y)
+ return x
- @staticmethod
- def encode(value):
- return str(pdfrw.objects.PdfString.encode(value))
+ def decode(self, value):
+ s = PdfString(value)
+ x = s.to_unicode()
+ y = s.decode()
+ self.assertEqual(x, y)
+ return x
- @classmethod
- def encode_decode(cls, value):
- return cls.decode(cls.encode(value))
+ def decode_bytes(self, decode_this, expected):
+ """ Decode to bytes"""
+ self.assertEqual(PdfString(decode_this).to_bytes(),
+ convert_store(expected))
- def roundtrip(self, value):
- self.assertEqual(value, self.encode_decode(value))
+ def roundtrip(self, value, expected=None):
+ result = self.encode(value)
+ self.assertEqual(value, self.decode(result))
+ if expected is not None:
+ self.assertEqual(result, expected)
+ return result
def test_doubleslash(self):
self.roundtrip('\\')
+ self.roundtrip(r'\\')
+ def test_unicode_encoding(self):
+ # These chars are in PdfDocEncoding
+ self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(')
+ # These chars are not in PdfDocEncoding
+ self.assertEqual(self.roundtrip(u'δΩσ')[0], '<')
+ # Check that we're doing a reasonable encoding
+ # Might want to change this later if we change the definition of reasonable
+ self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)')
+ self.roundtrip(u'(\n\u0101', '<FEFF0028000A0101>')
+
+
+ def test_constructor(self):
+ obj = PdfString('hello')
+
+ def test_continuation(self):
+ # See PDF 1.7 ref section 3.2 page 55
+ s1 = PdfString('(These two strings are the same.)')
+ self.assertEqual(s1.decode(), s1[1:-1])
+ s2 = PdfString('(These \\\ntwo strings \\\nare the same.)')
+ self.assertEqual(s1.decode(), s2.decode())
+ s2 = PdfString(s2.replace('\n', '\r'))
+ self.assertEqual(s1.decode(), s2.decode())
+ s2 = PdfString(s2.replace('\r', '\r\n'))
+ self.assertEqual(s1.decode(), s2.decode())
+
+ def test_hex_whitespace(self):
+ # See PDF 1.7 ref section 3.2 page 56
+ self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB')
+
+ def test_unicode_escaped_decode(self):
+ # Some PDF producers happily put unicode strings in PdfDocEncoding,
+ # because the Unicode BOM and \0 are valid code points
+ decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)')
+ self.assertEqual(decoded, "hello")
+
+
+ def test_unescaping(self):
+ self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)',
+ ' ( ) \\ \n \t \f \r \r\n \\n')
+
+ self.decode_bytes(r'(\b\010\10)', '\b\b\b')
+ self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)',
+ '\n\n\r\r\t\t\b\b\f\f()\001\023\f3')
+ self.decode_bytes(r'(\\\nabc)', '\\\nabc')
+ self.decode_bytes(r'(\ )', ' ')
+
+ def test_BOM_variants(self):
+ self.roundtrip(u'\ufeff', '<FEFFFEFF>')
+ self.roundtrip(u'\ufffe', '<FEFFFFFE>')
+ self.roundtrip(u'\xfe\xff', '<FEFF00FE00FF>')
+ self.roundtrip(u'\xff\xfe', '(\xff\xfe)')
+ self.assertRaises(UnicodeError, PdfString.from_unicode,
+ u'þÿ blah', text_encoding='pdfdocencoding')
+
+ def test_byte_encode(self):
+ self.assertEqual(self.encode(b'ABC'), '(ABC)')
+
+ def test_nullstring(self):
+ self.assertEqual(PdfString('<>').to_bytes(), b'')
+ self.assertEqual(PdfString('()').to_bytes(), b'')
def main():
unittest.main()
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
index cb3645e..a8349a6 100755
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -79,11 +79,12 @@ class TestOnePdf(unittest.TestCase):
result = 'skip -- encrypt'
hash = '------skip-encrypt-no-file------'
return self.skipTest('File encrypted')
- writer = pdfrw.PdfWriter(compress=compress)
+ writer = pdfrw.PdfWriter(dstf, compress=compress)
if repaginate:
writer.addpages(trailer.pages)
- trailer = None
- writer.write(dstf, trailer)
+ else:
+ writer.trailer = trailer
+ writer.write()
with open(dstf, 'rb') as f:
data = f.read()
size = len(data)
@@ -112,15 +113,20 @@ def build_tests():
def test(self):
self.roundtrip(*args, **kw)
return test
- for mytest, repaginate in (
- ('simple', False),
- ('repaginate', True)
+ for mytest, repaginate, decompress, compress in (
+ ('simple', False, False, False),
+ ('repaginate', True, False, False),
+ ('decompress', False, True, False),
+ ('compress', False, True, True),
):
for srcf in static_pdfs.pdffiles[0]:
basename = os.path.basename(srcf)
test_name = 'test_%s_%s' % (mytest, basename)
test = test_closure(mytest, basename, srcf,
- repaginate=repaginate)
+ repaginate=repaginate,
+ decompress=decompress,
+ compress=compress,
+ )
setattr(TestOnePdf, test_name, test)
build_tests()
diff --git a/tests/update_expected.py b/tests/update_expected.py
new file mode 100755
index 0000000..bed5331
--- /dev/null
+++ b/tests/update_expected.py
@@ -0,0 +1,84 @@
+#! /usr/bin/env python2
+"""
+Put old (good) results in ramdisk/reference,
+then generate new (unknown) test results in ramdisk/tmp_results,
+THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally:
+
+run this to update any checksums in expected.txt where both versions
+parse to same PDFs.
+"""
+
+import os
+import hashlib
+from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject
+
+
+def make_canonical(trailer):
+ ''' Canonicalizes a PDF. Assumes everything
+ is a Pdf object already.
+ '''
+ visited = set()
+ workitems = list(trailer.values())
+ while workitems:
+ obj = workitems.pop()
+ objid = id(obj)
+ if objid in visited:
+ continue
+ visited.add(objid)
+ obj.indirect = True
+ if isinstance(obj, (PdfArray, PdfDict)):
+ if isinstance(obj, PdfArray):
+ workitems += obj
+ else:
+ workitems += obj.values()
+ return trailer
+
+with open('expected.txt', 'rb') as f:
+ expected = f.read()
+
+def get_digest(fname):
+ with open(fname, 'rb') as f:
+ data = f.read()
+ if data:
+ return hashlib.md5(data).hexdigest()
+
+tmp = '_temp.pdf'
+count = 0
+goodcount = 0
+
+changes = []
+for (srcpath, _, filenames) in os.walk('ramdisk/reference'):
+ for name in filenames:
+ if not name.endswith('.pdf'):
+ continue
+ src = os.path.join(srcpath, name)
+ dst = src.replace('/reference/', '/tmp_results/')
+ if not os.path.exists(dst):
+ continue
+ src_digest = get_digest(src)
+ if not src_digest or src_digest not in expected:
+ continue
+ print src
+ count += 1
+ trailer = make_canonical(PdfReader(src))
+ out = PdfWriter(tmp)
+ out.write(trailer=trailer)
+ match_digest = get_digest(tmp)
+ if not match_digest:
+ continue
+ trailer = make_canonical(PdfReader(dst))
+ out = PdfWriter(tmp)
+ out.write(trailer=trailer)
+ if get_digest(tmp) != match_digest:
+ continue
+ goodcount += 1
+ print "OK"
+ changes.append((src_digest, get_digest(dst)))
+
+print count, goodcount
+
+for stuff in changes:
+ expected = expected.replace(*stuff)
+
+with open('expected.txt', 'wb') as f:
+ f.write(expected)