diff --git a/.gitignore b/.gitignore index 6260e55..b4c3391 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,17 @@ +# OSX +.DS_Store +.AppleDouble +.LSOverride +Icon + +# Thumbnails +._* + +# Files that might appear on external disk +.Spotlight-V100 +.Trashes + + # Development artifacts diffs.txt examples/*.pdf @@ -9,6 +23,7 @@ tests/pdfrw tests/static_pdfs tests/ramdisk tests/saved_results +tests/tmp_results wiki/ diff --git a/.travis.yml b/.travis.yml index caa88f5..dcdd573 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,8 @@ python: - "2.7" - "3.3" - "3.4" + - "3.5" + - "3.6" - "nightly" # command to install dependencies before_install: @@ -11,6 +13,7 @@ before_install: install: - "pip install ." - "pip install reportlab || true" + - "pip install PyCrypto || true" - "pip install zlib || true" - "pip install unittest2 || true" # command to run tests diff --git a/LICENSE.txt b/LICENSE.txt index 8d3c13d..e176dc4 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -8,10 +8,22 @@ Mathieu Fenniak and licensed under the BSD license (also reproduced below). Please add any missing authors here: -Copyright (c) 2006-2015 Patrick Maupin. All rights reserved. +Copyright (c) 2006-2017 Patrick Maupin. All rights reserved. Copyright (c) 2006 Mathieu Fenniak. All rights reserved. Copyright (c) 2010 Attila Tajti. All rights reserved. Copyright (c) 2012 Nerijus Mika. All rights reserved. +Copyright (c) 2015 Bastien Gandouet. All rights reserved. +Copyright (c) 2015 Tzerjen Wei. All rights reserved. +Copyright (c) 2015 Jorj X. McKie. All rights reserved. +Copyright (c) 2015 Nicholas Devenish. All rights reserved. +Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved. +Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved. +Copyright (c) 2016 James Laird-Wah. All rights reserved. +Copyright (c) 2016 Marcus Brinkmann. All rights reserved. +Copyright (c) 2016 Edward Betts. All rights reserved. +Copyright (c) 2016 Patrick Mazulo. All rights reserved. +Copyright (c) 2017 Haochen Wu. All rights reserved. +Copyright (c) 2017 Jon Lund Steffensen. All rights reserved. MIT License: diff --git a/MANIFEST.in b/MANIFEST.in index f90ac68..493839e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include *.txt *.in *.rst recursive-include examples *.txt *.py +recursive-include tests *.py diff --git a/README.rst b/README.rst index 2c91345..dc4a52c 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,6 @@ -============= -pdfrw 0.2b1 -============= +================== +pdfrw 0.4 +================== :Author: Patrick Maupin @@ -14,7 +14,7 @@ Introduction **pdfrw** is a Python library and utility that reads and writes PDF files: -* Version 0.2 is tested and works on Python 2.6, 2.7, 3.3, and 3.4. +* Version 0.4 is tested and works on Python 2.6, 2.7, 3.3, 3.4, 3.5, and 3.6 * Operations include subsetting, merging, rotating, modifying metadata, etc. * The fastest pure Python PDF parser available * Has been used for years by a printer in pre-press production @@ -74,10 +74,13 @@ try to use pdftk to uncompress and/or unencrypt them first. output. * `rl1/subset.py`__ Another subsetting example, using reportlab canvas for output. -* `rl1/platypus_pdf_template.py`__ Aother watermarking example, using +* `rl1/platypus_pdf_template.py`__ Another watermarking example, using reportlab canvas and generated output for the document. Contributed by user asannes. * `rl2`__ Experimental code for parsing graphics. Needs work. 
+* `subset_booklets.py`__ shows an example of creating a full printable pdf + version in a more professional and pratical way ( take a look at + http://www.wikihow.com/Bind-a-Book ) __ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py __ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py @@ -95,6 +98,7 @@ __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/ +__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset_booklets.py Notes on selected examples ------------------------------------ @@ -715,6 +719,8 @@ non-pure-Python libraries files. - `pycairo `__ can write PDF files. +- `PyMuPDF `_ high performance rendering + of PDF, (Open)XPS, CBZ and EPUB Other tools ----------- @@ -723,12 +729,50 @@ Other tools line tool for basic PDF manipulation. It complements pdfrw extremely well, supporting many operations such as decryption and decompression that pdfrw cannot do. +- `MuPDF `_ is a free top performance PDF, (Open)XPS, CBZ and EPUB rendering library + that also comes with some command line tools. One of those, ``mutool``, has big overlaps with pdftk's - + except it is up to 10 times faster. Release information ======================= Revisions: +0.4 -- Released 18 September, 2017 + + - Python 3.6 added to test matrix + - Proper unicode support for text strings in PDFs added + - buildxobj fixes allow better support creating form XObjects + out of compressed pages in some cases + - Compression fixes for Python 3+ + - New subset_booklets.py example + - Bug with non-compressed indices into compressed object streams fixed + - Bug with distinguishing compressed object stream first objects fixed + - Better error reporting added for some invalid PDFs (e.g. when reading + past the end of file) + - Better scrubbing of old bookmark information when writing PDFs, to + remove dangling references + - Refactoring of pdfwriter, including updating API, to allow future + enhancements for things like incremental writing + - Minor tokenizer speedup + - Some flate decompressor bugs fixed + - Compression and decompression tests added + - Tests for new unicode handling added + - PdfReader.readpages() recursion error (issue #92) fixed. + - Initial crypt filter support added + + +0.3 -- Released 19 October, 2016. + + - Python 3.5 added to test matrix + - Better support under Python 3.x for in-memory PDF file-like objects + - Some pagemerge and Unicode patches added + - Changes to logging allow better coexistence with other packages + - Fix for "from pdfrw import \*" + - New fancy_watermark.py example shows off capabilities of pagemerge.py + - metadata.py example renamed to cat.py + + 0.2 -- Released 21 June, 2015. Supports Python 2.6, 2.7, 3.3, and 3.4. 
- Several bugs have been fixed diff --git a/debian/changelog b/debian/changelog index a0836e5..3c09d9c 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,32 @@ +pdfrw (0.4-2) unstable; urgency=medium + + * Bumped Standards-Version to 4.1.3 + * Replaced python-reportlab in python3-pdfrw by python3-reportlab + + -- Rodrigo Siqueira Thu, 12 Apr 2018 12:14:12 -0300 + +pdfrw (0.4-1) unstable; urgency=medium + + * New upstream version + * Added "Multi-Arch: foreign" to python-pdfrw-doc + + [ Lucas Kanashiro ] + * Update years of upstream copyright + * debian/copyright: use https:// instead of http:// in Format field + + -- Rodrigo Siqueira Thu, 21 Sep 2017 09:55:46 -0300 + +pdfrw (0.3-1) unstable; urgency=medium + + * New maintainer (Closes: #738298) + * New upstream version + * Bumped Standards-Version to 4.0.0 + * Bumped debian/compat to 10 + * Depend on debhelper >= 10 + * Added package test with autopkgtests tool + + -- Rodrigo Siqueira Wed, 30 Aug 2017 19:18:45 -0300 + pdfrw (0.2-3) unstable; urgency=medium * QA upload. diff --git a/debian/compat b/debian/compat index ec63514..f599e28 100644 --- a/debian/compat +++ b/debian/compat @@ -1 +1 @@ -9 +10 diff --git a/debian/control b/debian/control index 00ee686..4c3f8c7 100644 --- a/debian/control +++ b/debian/control @@ -1,20 +1,21 @@ Source: pdfrw Section: python Priority: optional -Maintainer: Debian QA Group +Maintainer: Rodrigo Siqueira Build-Depends: - debhelper (>= 9), + debhelper (>= 10), dh-python, python-all (>= 2.6.6-3~), python-setuptools, python3-all, python3-setuptools, -Standards-Version: 3.9.8 +Standards-Version: 4.1.3 Homepage: https://github.com/pmaupin/pdfrw Vcs-Git: https://git.dgit.debian.org/pdfrw Vcs-Browser: https://browse.dgit.debian.org/pdfrw.git/ X-Python-Version: >= 2.6 X-Python3-Version: >= 3.2 +Testsuite: autopkgtest-pkg-python Package: python-pdfrw Architecture: all @@ -44,6 +45,7 @@ Description: PDF file manipulation library (Python 2) Package: python-pdfrw-doc Architecture: all +Multi-Arch: foreign Depends: ${misc:Depends}, Section: doc @@ -72,7 +74,7 @@ Depends: ${python3:Depends}, Suggests: python-pdfrw-doc, - python-reportlab, + python3-reportlab, Description: PDF file manipulation library (Python 3) pdfrw can read and write PDF files, and can also be used to read in PDFs which can then be used inside reportlab. diff --git a/debian/copyright b/debian/copyright index bb4cf16..679ba8f 100644 --- a/debian/copyright +++ b/debian/copyright @@ -1,10 +1,10 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: pdfrw Upstream-Contact: Patrick Maupin Source: https://github.com/pmaupin/pdfrw Files: * -Copyright: © 2006-2015 Patrick Maupin +Copyright: © 2006-2017 Patrick Maupin © 2010 Attila Tajti © 2012 Narijus Mika License: Expat diff --git a/examples/4up.py b/examples/4up.py index ad2bd3b..91ac64e 100755 --- a/examples/4up.py +++ b/examples/4up.py @@ -27,7 +27,7 @@ def get4(srcpages): inpfn, = sys.argv[1:] outfn = '4up.' + os.path.basename(inpfn) pages = PdfReader(inpfn).pages -writer = PdfWriter() +writer = PdfWriter(outfn) for index in range(0, len(pages), 4): writer.addpage(get4(pages[index:index + 4])) -writer.write(outfn) +writer.write() diff --git a/examples/README.txt b/examples/README.txt index 242f5be..5564501 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -6,7 +6,7 @@ alter.py -- Simple example of making a very slight modification to a PDF. 
booklet.py -- Converts a PDF into a booklet. -metadata.py -- Concatenates multiple PDFs, adds metadata. +cat.py -- Concatenates multiple PDFs, adds metadata. poster.py -- Changes the size of a PDF to create a poster diff --git a/examples/alter.py b/examples/alter.py index 45b9c76..bb236fa 100755 --- a/examples/alter.py +++ b/examples/alter.py @@ -19,6 +19,4 @@ outfn = 'alter.' + os.path.basename(inpfn) trailer = PdfReader(inpfn) trailer.Info.Title = 'My New Title Goes Here' -writer = PdfWriter() -writer.trailer = trailer -writer.write(outfn) +PdfWriter(outfn, trailer=trailer).write() diff --git a/examples/booklet.py b/examples/booklet.py index 4758b08..e6b523d 100755 --- a/examples/booklet.py +++ b/examples/booklet.py @@ -1,16 +1,23 @@ #!/usr/bin/env python ''' -usage: booklet.py my.pdf +usage: booklet.py [-p] my.pdf Creates booklet.my.pdf Pages organized in a form suitable for booklet printing, e.g. to print 4 8.5x11 pages using a single 11x17 sheet (double-sided). + +The output would be using the same type of sheet +and you can get up to 3 blank sides if -p is enabled. + +Otherwise the two sides in the middle will be in original page size +and you can have 1 blank sides at most. + ''' -import sys import os +import argparse from pdfrw import PdfReader, PdfWriter, PageMerge @@ -21,13 +28,23 @@ def fixpage(*pages): return result.render() -inpfn, = sys.argv[1:] +parser = argparse.ArgumentParser() +parser.add_argument("input", help="Input pdf file name") +parser.add_argument("-p", "--padding", action = "store_true", + help="Padding the document so that all pages use the same type of sheet") +args = parser.parse_args() + +inpfn = args.input outfn = 'booklet.' + os.path.basename(inpfn) ipages = PdfReader(inpfn).pages -# Make sure we have an even number -if len(ipages) & 1: - ipages.append(None) +if args.padding: + pad_to = 4 +else: + pad_to = 2 + +# Make sure we have a correct number of sides +ipages += [None]*(-len(ipages)%pad_to) opages = [] while len(ipages) > 2: @@ -36,4 +53,4 @@ while len(ipages) > 2: opages += ipages -PdfWriter().addpages(opages).write(outfn) +PdfWriter(outfn).addpages(opages).write() diff --git a/examples/extract.py b/examples/extract.py index 3756b4f..dd6e267 100755 --- a/examples/extract.py +++ b/examples/extract.py @@ -22,6 +22,6 @@ outfn = 'extract.' + os.path.basename(inpfn) pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72)) if not pages: raise IndexError("No XObjects found") -writer = PdfWriter() +writer = PdfWriter(outfn) writer.addpages(pages) -writer.write(outfn) +writer.write() diff --git a/examples/fancy_watermark.py b/examples/fancy_watermark.py new file mode 100755 index 0000000..e9c797d --- /dev/null +++ b/examples/fancy_watermark.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +''' +Enhanced example of watermarking using form xobjects (pdfrw). + +usage: fancy_watermark.py [-u] my.pdf single_page.pdf + +Creates watermark.my.pdf, with every page overlaid with +first page from single_page.pdf. If -u is selected, watermark +will be placed underneath page (painted first). + +The stock watermark.py program assumes all pages are the same +size. This example deals with pages of differing sizes in order +to show some concepts of positioning and scaling. + +This version applies the watermark such that the upper right +corner of the watermark is at the upper right corner of the +document page for odd pages, and at the upper left corner +of the document page for even pages, for each page of the +document. 
+ +It also rescales the size of the watermark if the watermark +is too wide for the page. + +These scaling and positioning adjustments can easily +be customized for any particular application. + +To handle documents with different page sizes, a cache is +maintained of a modified intermediate watermark object +for each page size. +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + +# Get all the filenames + +argv = sys.argv[1:] +underneath = '-u' in argv +if underneath: + del argv[argv.index('-u')] +inpfn, wmarkfn = argv +outfn = 'watermark.' + os.path.basename(inpfn) + +# Open both the source files +wmark_trailer = PdfReader(wmarkfn) +trailer = PdfReader(inpfn) + +# Handle different sized pages in same document with +# a memoization cache, so we don't create more watermark +# objects than we need to (typically only one per document). + +wmark_page = wmark_trailer.pages[0] +wmark_cache = {} + +# Process every page +for pagenum, page in enumerate(trailer.pages, 1): + + # Get the media box of the page, and see + # if we have a matching watermark in the cache + mbox = tuple(float(x) for x in page.MediaBox) + odd = pagenum & 1 + key = mbox, odd + wmark = wmark_cache.get(key) + if wmark is None: + + # Create and cache a new watermark object. + wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0] + + # The math is more complete than it probably needs to be, + # because the origin of all pages is almost always (0, 0). + # Nonetheless, we illustrate all the values and their names. + + page_x, page_y, page_x1, page_y1 = mbox + page_w = page_x1 - page_x + page_h = page_y1 - page_y # For illustration, not used + + # Scale the watermark if it is too wide for the page + # (Could do the same for height instead if needed) + if wmark.w > page_w: + wmark.scale(1.0 * page_w / wmark.w) + + # Always put watermark at the top of the page + # (but see horizontal positioning for other ideas) + wmark.y += page_y1 - wmark.h + + # For odd pages, put it at the left of the page, + # and for even pages, put it on the right of the page. + if odd: + wmark.x = page_x + else: + wmark.x += page_x1 - wmark.w + + # Optimize the case where the watermark is same width + # as page. + if page_w == wmark.w: + wmark_cache[mbox, not odd] = wmark + + # Add the watermark to the page + PageMerge(page).add(wmark, prepend=underneath).render() + +# Write out the destination file +PdfWriter(outfn, trailer=trailer).write() diff --git a/examples/poster.py b/examples/poster.py index 7f1c1c2..1db9378 100755 --- a/examples/poster.py +++ b/examples/poster.py @@ -37,7 +37,7 @@ def adjust(page, margin=36, scale=4.8): inpfn, = sys.argv[1:] outfn = 'poster.' + os.path.basename(inpfn) reader = PdfReader(inpfn) -writer = PdfWriter() +writer = PdfWriter(outfn) writer.addpage(adjust(reader.pages[0])) writer.trailer.Info = IndirectPdfDict(reader.Info or {}) -writer.write(outfn) +writer.write() diff --git a/examples/print_two.py b/examples/print_two.py index c54eaee..b710192 100755 --- a/examples/print_two.py +++ b/examples/print_two.py @@ -29,4 +29,4 @@ def fixpage(page, count=[0]): inpfn, = sys.argv[1:] outfn = 'print_two.' 
+ os.path.basename(inpfn) pages = PdfReader(inpfn).pages -PdfWriter().addpages(fixpage(x) for x in pages).write(outfn) +PdfWriter(outfn).addpages(fixpage(x) for x in pages).write() diff --git a/examples/rl2/decodegraphics.py b/examples/rl2/decodegraphics.py index e2f3a9f..d26daf7 100644 --- a/examples/rl2/decodegraphics.py +++ b/examples/rl2/decodegraphics.py @@ -232,6 +232,19 @@ def parse_text_out(self, token='Tj', params='t'): text = params[0].decode(self.curfont.remap, self.curfont.twobyte) self.tpath.textOut(text) +def parse_lf_text_out(self, token="'", params='t'): + self.tpath.textLine() + text = params[0].decode(self.curfont.remap, self.curfont.twobyte) + self.tpath.textOut(text) + + +def parse_lf_text_out_with_spacing(self, token='"', params='fft'): + self.tpath.setWordSpace(params[0]) + self.tpath.setCharSpace(params[1]) + self.tpath.textLine() + text = params[2].decode(self.curfont.remap, self.curfont.twobyte) + self.tpath.textOut(text) + def parse_TJ(self, token='TJ', params='a'): remap = self.curfont.remap @@ -377,7 +390,7 @@ class _ParseClass(object): self.gpath = None self.tpath = None self.fontdict = dict((x, FontInfo(y)) for - (x, y) in page.Resources.Font.iteritems()) + (x, y) in page.Resources.Font.items()) for token in self.tokens: info = dispatch(token) @@ -424,7 +437,7 @@ def debugparser(undisturbed=set('parse_array'.split())): myfunc = oldval[0] return myfunc, oldval[1] return dict((x, getvalue(y)) - for (x, y) in _ParseClass.dispatch.iteritems()) + for (x, y) in _ParseClass.dispatch.items()) class _DebugParse(_ParseClass): dispatch = debugdispatch() @@ -435,10 +448,10 @@ parsepage = _ParseClass.parsepage if __name__ == '__main__': import sys - from pdfreader import PdfReader + from pdfrw import PdfReader parse = debugparser() fname, = sys.argv[1:] - pdf = PdfReader(fname) + pdf = PdfReader(fname, decompress=True) for i, page in enumerate(pdf.pages): print ('\nPage %s ------------------------------------' % i) parse(page) diff --git a/examples/rotate.py b/examples/rotate.py index 8b10d05..0115401 100755 --- a/examples/rotate.py +++ b/examples/rotate.py @@ -36,6 +36,6 @@ for onerange in ranges: pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or 0) + rotate) % 360 -outdata = PdfWriter() +outdata = PdfWriter(outfn) outdata.trailer = trailer -outdata.write(outfn) +outdata.write() diff --git a/examples/subset.py b/examples/subset.py index 30a577a..e965850 100755 --- a/examples/subset.py +++ b/examples/subset.py @@ -20,10 +20,10 @@ assert ranges, "Expected at least one range" ranges = ([int(y) for y in x.split('-')] for x in ranges) outfn = 'subset.%s' % os.path.basename(inpfn) pages = PdfReader(inpfn).pages -outdata = PdfWriter() +outdata = PdfWriter(outfn) for onerange in ranges: onerange = (onerange + onerange[-1:])[:2] for pagenum in range(onerange[0], onerange[1]+1): outdata.addpage(pages[pagenum-1]) -outdata.write(outfn) +outdata.write() diff --git a/examples/subset_booklets.py b/examples/subset_booklets.py new file mode 100755 index 0000000..db0b9af --- /dev/null +++ b/examples/subset_booklets.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +''' +usage: subset_booklets.py my.pdf + +Creates subset_booklets.my.pdf + +Pages organized in a form suitable for booklet printing, e.g. +to print 4 8.5x11 pages using a single 11x17 sheet (double-sided). +Instead of a large booklet, the pdf is divided into several mini +booklets. 
The reason is: professional printing works this way: + - Print all of several mini booklets(subsets of booklet); + - Saw each mini booklet individually; + - glue them all together; + - Insert the cover. + + Take a look at http://www.wikihow.com/Bind-a-Book +''' + +import sys +import os +import time +from pdfrw import PdfReader, PdfWriter, PageMerge + +BOOKLET_SIZE = 20 +START = time.time() + +def fixpage(*pages): + result = PageMerge() + (x for x in pages if x is not None) + result[-1].x += result[0].w + return result.render() + +INPFN, = sys.argv[1:] +OUTFN = 'booklet.' + os.path.basename(INPFN) +ALL_IPAGES = PdfReader(INPFN).pages +print 'The pdf file '+str(INPFN)+' has '+str(len(ALL_IPAGES))+' pages.' + +#Make sure we have an even number +if len(ALL_IPAGES) & 1: + ALL_IPAGES.append(None) + print 'Inserting one more blank page to make pages number even.' +NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE) + +print 'Making '+str(NUM_OF_ITER)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.' +opages = [] +for iteration in range(0, NUM_OF_ITER): + ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE] + while len(ipages) > 2: + opages.append(fixpage(ipages.pop(), ipages.pop(0))) + opages.append(fixpage(ipages.pop(0), ipages.pop())) + +# Making one more subbooklet with the left pages +ipages = ALL_IPAGES[len(ALL_IPAGES)-ITERS_LEFT:len(ALL_IPAGES)] +while len(ipages) > 2: + opages.append(fixpage(ipages.pop(), ipages.pop(0))) + opages.append(fixpage(ipages.pop(0), ipages.pop())) +if len(ipages) >= 1: + opages.append(fixpage(ipages.pop(), ipages.pop(0))) + +PdfWriter(OUTFN).addpages(opages).write() +print 'It took '+ str(round(time.time()-START, 2))+' seconds to make the pdf subbooklets changes.' diff --git a/examples/unspread.py b/examples/unspread.py index 4b3bc5d..4caa973 100755 --- a/examples/unspread.py +++ b/examples/unspread.py @@ -26,7 +26,7 @@ def splitpage(src): inpfn, = sys.argv[1:] outfn = 'unspread.' + os.path.basename(inpfn) -writer = PdfWriter() +writer = PdfWriter(outfn) for page in PdfReader(inpfn).pages: writer.addpages(splitpage(page)) -writer.write(outfn) +writer.write() diff --git a/examples/watermark.py b/examples/watermark.py index 96b686b..1188502 100755 --- a/examples/watermark.py +++ b/examples/watermark.py @@ -9,10 +9,14 @@ Creates watermark.my.pdf, with every page overlaid with first page from single_page.pdf. If -u is selected, watermark will be placed underneath page (painted first). -NB: At one point, this example was extremely complicated, with - multiple options. That only led to errors in implementation, - so it has been re-simplified in order to show basic principles - of the library operation and to match the other examples better. +NOTE 1: This program assumes that all pages (including the watermark + page) are the same size. For other possibilities, see + the fancy_watermark.py example. + +NOTE 2: At one point, this example was extremely complicated, with + multiple options. That only led to errors in implementation, + so it has been re-simplified in order to show basic principles + of the library operation and to match the other examples better. 
''' import sys @@ -30,4 +34,4 @@ wmark = PageMerge().add(PdfReader(wmarkfn).pages[0])[0] trailer = PdfReader(inpfn) for page in trailer.pages: PageMerge(page).add(wmark, prepend=underneath).render() -PdfWriter().write(outfn, trailer) +PdfWriter(outfn, trailer=trailer).write() diff --git a/pdfrw/__init__.py b/pdfrw/__init__.py index a36a8cb..cf7644a 100644 --- a/pdfrw/__init__.py +++ b/pdfrw/__init__.py @@ -10,13 +10,14 @@ from .tokens import PdfTokens from .errors import PdfParseError from .pagemerge import PageMerge -__version__ = '0.2' +__version__ = '0.4' # Add a tiny bit of compatibility to pyPdf PdfFileReader = PdfReader PdfFileWriter = PdfWriter -__all__ = [PdfWriter, PdfReader, PdfObject, PdfName, PdfArray, - PdfTokens, PdfParseError, PdfDict, IndirectPdfDict, - PdfString, PageMerge] +__all__ = """PdfWriter PdfReader PdfObject PdfName PdfArray + PdfTokens PdfParseError PdfDict IndirectPdfDict + PdfString PageMerge""".split() + diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py index d210c67..f132795 100644 --- a/pdfrw/buildxobj.py +++ b/pdfrw/buildxobj.py @@ -32,6 +32,8 @@ from .objects import PdfDict, PdfArray, PdfName from .pdfreader import PdfReader from .errors import log, PdfNotImplementedError from .py23_diffs import iteritems +from .uncompress import uncompress +from .compress import compress class ViewInfo(object): @@ -169,6 +171,10 @@ def _build_cache(contents, allow_compressed): and save it along with private cache info. Assumes validity has been pre-checked if we have a non-None xobj_copy. + + Also, the spec says nothing about nested arrays, + so we assume those don't exist until we see one + in the wild. ''' try: xobj_copy = contents.xobj_copy @@ -183,9 +189,20 @@ def _build_cache(contents, allow_compressed): array = [contents] private = contents.private - # The spec says nothing about nested arrays. Will - # assume that's not a problem until we encounter them... + # If we don't allow compressed objects, OR if we have multiple compressed + # objects, we try to decompress them, and fail if we cannot do that. + if not allow_compressed or len(array) > 1: + keys = set(x[0] for cdict in array for x in iteritems(cdict)) + was_compressed = len(keys) > 1 + if was_compressed: + # Make copies of the objects before we uncompress them. + array = [PdfDict(x) for x in array] + if not uncompress(array): + raise PdfNotImplementedError( + 'Xobjects with these compression parameters not supported: %s' % + keys) + xobj_copy = PdfDict(array[0]) xobj_copy.private.xobj_cachedict = {} private.xobj_copy = xobj_copy @@ -195,19 +212,9 @@ def _build_cache(contents, allow_compressed): newlength = sum(int(x.Length) for x in array) + len(array) - 1 assert newlength == len(newstream) xobj_copy.stream = newstream + if was_compressed and allow_compressed: + compress(xobj_copy) - # Cannot currently cope with different kinds of - # compression in the array, so just disallow it. 
- allow_compressed = False - - if not allow_compressed: - # Make sure there are no compression parameters - for cdict in array: - keys = [x[0] for x in iteritems(cdict)] - if len(keys) != 1: - raise PdfNotImplementedError( - 'Xobjects with compression parameters not supported: %s' % - keys) return xobj_copy diff --git a/pdfrw/compress.py b/pdfrw/compress.py index 0479131..b7b4e75 100644 --- a/pdfrw/compress.py +++ b/pdfrw/compress.py @@ -3,14 +3,14 @@ # MIT license -- See LICENSE.txt for details ''' -Currently, this sad little file only knows how to decompress +Currently, this sad little file only knows how to compress using the flate (zlib) algorithm. Maybe more later, but it's not a priority for me... ''' from .objects import PdfName from .uncompress import streamobjects -from .py23_diffs import zlib +from .py23_diffs import zlib, convert_load, convert_store def compress(mylist): @@ -20,7 +20,7 @@ def compress(mylist): if ftype is not None: continue oldstr = obj.stream - newstr = zlib.compress(oldstr) + newstr = convert_load(zlib.compress(convert_store(oldstr))) if len(newstr) < len(oldstr) + 30: obj.stream = newstr obj.Filter = flate diff --git a/pdfrw/crypt.py b/pdfrw/crypt.py new file mode 100644 index 0000000..dc00676 --- /dev/null +++ b/pdfrw/crypt.py @@ -0,0 +1,150 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2017 Jon Lund Steffensen +# MIT license -- See LICENSE.txt for details + +from __future__ import division + +import hashlib +import struct + +try: + from Crypto.Cipher import ARC4, AES + HAS_CRYPTO = True +except ImportError: + HAS_CRYPTO = False + +from .objects import PdfDict, PdfName + +_PASSWORD_PAD = ( + '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' + '..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') + + +def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict): + for obj in mylist: + if isinstance(obj, PdfDict) and obj.stream is not None: + yield obj + + +def create_key(password, doc): + """Create an encryption key (Algorithm 2 in PDF spec).""" + key_size = int(doc.Encrypt.Length or 40) // 8 + padded_pass = (password + _PASSWORD_PAD)[:32] + hasher = hashlib.md5() + hasher.update(padded_pass) + hasher.update(doc.Encrypt.O.to_bytes()) + hasher.update(struct.pack('= 3: + for _ in range(50): + temp_hash = hashlib.md5(temp_hash[:key_size]).digest() + + return temp_hash[:key_size] + + +def create_user_hash(key, doc): + """Create the user password hash (Algorithm 4/5).""" + revision = int(doc.Encrypt.R or 0) + if revision < 3: + cipher = ARC4.new(key) + return cipher.encrypt(_PASSWORD_PAD) + else: + hasher = hashlib.md5() + hasher.update(_PASSWORD_PAD) + hasher.update(doc.ID[0].to_bytes()) + temp_hash = hasher.digest() + + for i in range(20): + temp_key = ''.join(chr(i ^ ord(x)) for x in key) + cipher = ARC4.new(temp_key) + temp_hash = cipher.encrypt(temp_hash) + + return temp_hash + + +def check_user_password(key, doc): + """Check that the user password is correct (Algorithm 6).""" + expect_user_hash = create_user_hash(key, doc) + revision = int(doc.Encrypt.R or 0) + if revision < 3: + return doc.Encrypt.U.to_bytes() == expect_user_hash + else: + return doc.Encrypt.U.to_bytes()[:16] == expect_user_hash + + +class AESCryptFilter(object): + """Crypt filter corresponding to /AESV2.""" + def __init__(self, key): + self._key = key + + def decrypt_data(self, num, gen, data): + """Decrypt data (string/stream) using key (Algorithm 1).""" + key_extension = struct.pack('= 1 and ftype[0] == PdfName.Crypt: + ftype = ftype[1:] + parms = obj.DecodeParms or obj.DP + 
filter = filters[parms.Name] + + num, gen = obj.indirect + obj.stream = filter.decrypt_data(num, gen, obj.stream) + obj.private.decrypted = True + obj.Filter = ftype or None diff --git a/pdfrw/errors.py b/pdfrw/errors.py index 263cd4d..ef6ab7d 100644 --- a/pdfrw/errors.py +++ b/pdfrw/errors.py @@ -9,11 +9,14 @@ PDF Exceptions and error handling import logging -logging.basicConfig( - format='[%(levelname)s] %(filename)s:%(lineno)d %(message)s', - level=logging.WARNING) +fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d %(message)s') + +handler = logging.StreamHandler() +handler.setFormatter(fmt) log = logging.getLogger('pdfrw') +log.setLevel(logging.WARNING) +log.addHandler(handler) class PdfError(Exception): diff --git a/pdfrw/findobjs.py b/pdfrw/findobjs.py index f19ebdf..67d33a0 100644 --- a/pdfrw/findobjs.py +++ b/pdfrw/findobjs.py @@ -8,7 +8,6 @@ ''' from .objects import PdfDict, PdfArray, PdfName -from .pdfwriter import user_fmt def find_objects(source, valid_types=(PdfName.XObject, None), @@ -81,7 +80,7 @@ def wrap_object(obj, width, margin): iw, ih = float(obj.Width), float(obj.Height) ch = 1.0 * cw / iw * ih height = ch + margin[1] + margin[3] - p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset)) + p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset)) contents.stream = fmt % p resources = PdfDict(XObject=PdfDict(MyImage=obj)) mbox = PdfArray((0, 0, width, height)) diff --git a/pdfrw/objects/__init__.py b/pdfrw/objects/__init__.py index 1746dfe..879e0ef 100644 --- a/pdfrw/objects/__init__.py +++ b/pdfrw/objects/__init__.py @@ -15,5 +15,5 @@ from .pdfobject import PdfObject from .pdfstring import PdfString from .pdfindirect import PdfIndirect -__all__ = [PdfName, PdfDict, IndirectPdfDict, PdfArray, - PdfObject, PdfString, PdfIndirect] +__all__ = """PdfName PdfDict IndirectPdfDict PdfArray + PdfObject PdfString PdfIndirect""".split() diff --git a/pdfrw/objects/pdfarray.py b/pdfrw/objects/pdfarray.py index b662755..e15f4ad 100644 --- a/pdfrw/objects/pdfarray.py +++ b/pdfrw/objects/pdfarray.py @@ -65,3 +65,7 @@ class PdfArray(list): def pop(self, *args): self._resolve() return list.pop(self, *args) + + def __reversed__(self): + self._resolve() + return list.__reversed__(self) diff --git a/pdfrw/objects/pdfdict.py b/pdfrw/objects/pdfdict.py index fc28492..0fdf75b 100644 --- a/pdfrw/objects/pdfdict.py +++ b/pdfrw/objects/pdfdict.py @@ -136,7 +136,15 @@ class PdfDict(dict): ''' value = dictget(self, key) if isinstance(value, PdfIndirect): - self[key] = value = value.real_value() + # We used to use self[key] here, but that does an + # unwanted check on the type of the key (github issue #98). + # Python will keep the old key object in the dictionary, + # so that check is not necessary. 
+ value = value.real_value() + if value is not None: + dict.__setitem__(self, key, value) + else: + del self[name] return value def __getitem__(self, key): diff --git a/pdfrw/objects/pdfname.py b/pdfrw/objects/pdfname.py index 1fdf5b5..28a1464 100644 --- a/pdfrw/objects/pdfname.py +++ b/pdfrw/objects/pdfname.py @@ -23,6 +23,7 @@ class BasePdfName(str): ''' indirect = False + encoded = None whitespace = '\x00 \t\f\r\n' delimiters = '()<>{}[]/%' diff --git a/pdfrw/objects/pdfstring.py b/pdfrw/objects/pdfstring.py index 5c35d70..906f30e 100644 --- a/pdfrw/objects/pdfstring.py +++ b/pdfrw/objects/pdfstring.py @@ -1,74 +1,553 @@ # A part of pdfrw (https://github.com/pmaupin/pdfrw) -# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas +# 2016 James Laird-Wah, Sydney, Australia # MIT license -- See LICENSE.txt for details -import re +""" +================================ +PdfString encoding and decoding +================================ + +Introduction +============= + + +This module handles encoding and decoding of PDF strings. PDF strings +are described in the PDF 1.7 reference manual, mostly in chapter 3 +(sections 3.2 and 3.8) and chapter 5. + +PDF strings are used in the document structure itself, and also inside +the stream of page contents dictionaries. + +A PDF string can represent pure binary data (e.g. for a font or an +image), or text, or glyph indices. For Western fonts, the glyph indices +usually correspond to ASCII, but that is not guaranteed. (When it does +happen, it makes examination of raw PDF data a lot easier.) + +The specification defines PDF string encoding at two different levels. +At the bottom, it defines ways to encode arbitrary bytes so that a PDF +tokenizer can understand they are a string of some sort, and can figure +out where the string begins and ends. (That is all the tokenizer itself +cares about.) Above that level, if the string represents text, the +specification defines ways to encode Unicode text into raw bytes, before +the byte encoding is performed. + +There are two ways to do the byte encoding, and two ways to do the text +(Unicode) encoding. + +Encoding bytes into PDF strings +================================ + +Adobe calls the two ways to encode bytes into strings "Literal strings" +and "Hexadecimal strings." + +Literal strings +------------------ + +A literal string is delimited by ASCII parentheses ("(" and ")"), and a +hexadecimal string is delimited by ASCII less-than and greater-than +signs ("<" and ">"). + +A literal string may encode bytes almost unmolested. The caveat is +that if a byte has the same value as a parenthesis, it must be escaped +so that the tokenizer knows the string is not finished. This is accomplished +by using the ASCII backslash ("\") as an escape character. Of course, +now any backslash appearing in the data must likewise be escaped. + +Hexadecimal strings +--------------------- + +A hexadecimal string requires twice as much space as the source data +it represents (plus two bytes for the delimiter), simply storing each +byte as two hexadecimal digits, most significant digit first. The spec +allows for lower or upper case hex digits, but most PDF encoders seem +to use upper case. + +Special cases -- Legacy systems and readability +----------------------------------------------- + +It is possible to create a PDF document that uses 7 bit ASCII encoding, +and it is desirable in many cases to create PDFs that are reasonably +readable when opened in a text editor. 
For these reasons, the syntax +for both literal strings and hexadecimal strings is slightly more +complicated that the initial description above. In general, the additional +syntax allows the following features: + + - Making the delineation between characters, or between sections of + a string, apparent, and easy to see in an editor. + - Keeping output lines from getting too wide for some editors + - Keeping output lines from being so narrow that you can only see the + small fraction of a string at a time in an editor. + - Suppressing unprintable characters + - Restricting the output string to 7 bit ASCII + +Hexadecimal readability +~~~~~~~~~~~~~~~~~~~~~~~ + +For hexadecimal strings, only the first two bullets are relevant. The syntax +to accomplish this is simple, allowing any ASCII whitespace to be inserted +anywhere in the encoded hex string. + +Literal readability +~~~~~~~~~~~~~~~~~~~ + +For literal strings, all of the bullets except the first are relevant. +The syntax has two methods to help with these goals. The first method +is to overload the escape operator to be able to do different functions, +and the second method can reduce the number of escapes required for +parentheses in the normal case. + +The escape function works differently, depending on what byte follows +the backslash. In all cases, the escaping backslash is discarded, +and then the next character is examined: + + - For parentheses and backslashes (and, in fact, for all characters + not described otherwise in this list), the character after the + backslash is preserved in the output. + - A letter from the set of "nrtbf" following a backslash is interpreted as + a line feed, carriage return, tab, backspace, or form-feed, respectively. + - One to three octal digits following the backslash indicate the + numeric value of the encoded byte. + - A carriage return, carriage return/line feed, or line feed following + the backslash indicates a line break that was put in for readability, + and that is not part of the actual data, so this is discarded. + +The second method that can be used to improve readability (and reduce space) +in literal strings is to not escape parentheses. This only works, and is +only allowed, when the parentheses are properly balanced. For example, +"((Hello))" is a valid encoding for a literal string, but "((Hello)" is not; +the latter case should be encoded "(\(Hello)" + +Encoding text into strings +========================== + +Section 3.8.1 of the PDF specification describes text strings. + +The individual characters of a text string can all be considered to +be Unicode; Adobe specifies two different ways to encode these characters +into a string of bytes before further encoding the byte string as a +literal string or a hexadecimal string. + +The first way to encode these strings is called PDFDocEncoding. This +is mostly a one-for-one mapping of bytes into single bytes, similar to +Latin-1. The representable character set is limited to the number of +characters that can fit in a byte, and this encoding cannot be used +with Unicode strings that start with the two characters making up the +UTF-16-BE BOM. + +The second way to encode these strings is with UTF-16-BE. Text strings +encoded with this method must start with the BOM, and although the spec +does not appear to mandate that the resultant bytes be encoded into a +hexadecimal string, that seems to be the canonical way to do it. 
+ +When encoding a string into UTF-16-BE, this module always adds the BOM, +and when decoding a string from UTF-16-BE, this module always strips +the BOM. If a source string contains a BOM, that will remain in the +final string after a round-trip through the encoder and decoder, as +the goal of the encoding/decoding process is transparency. + + +PDF string handling in pdfrw +============================= + +Responsibility for handling PDF strings in the pdfrw library is shared +between this module, the tokenizer, and the pdfwriter. + +tokenizer string handling +-------------------------- + +As far as the tokenizer and its clients such as the pdfreader are concerned, +the PdfString class must simply be something that it can instantiate by +passing a string, that doesn't compare equal (or throw an exception when +compared) to other possible token strings. The tokenizer must understand +enough about the syntax of the string to successfully find its beginning +and end in a stream of tokens, but doesn't otherwise know or care about +the data represented by the string. + +pdfwriter string handling +-------------------------- + +The pdfwriter knows and cares about two attributes of PdfString instances: + + - First, PdfString objects have an 'indirect' attribute, which pdfwriter + uses as an indication that the object knows how to represent itself + correctly when output to a new PDF. (In the case of a PdfString object, + no work is really required, because it is already a string.) + - Second, the PdfString.encode() method is used as a convenience to + automatically convert any user-supplied strings (that didn't come + from PDFs) when a PDF is written out to a file. + +pdfstring handling +------------------- + +The code in this module is designed to support those uses by the +tokenizer and the pdfwriter, and to additionally support encoding +and decoding of PdfString objects as a convenience for the user. + +Most users of the pdfrw library never encode or decode a PdfString, +so it is imperative that (a) merely importing this module does not +take a significant amount of CPU time; and (b) it is cheap for the +tokenizer to produce a PdfString, and cheap for the pdfwriter to +consume a PdfString -- if the tokenizer finds a string that conforms +to the PDF specification, it will be wrapped in a PdfString object, +and if the pdfwriter finds an object with an indirect attribute, it +simply calls str() to ask it to format itself. + +Encoding and decoding are not actually performed very often at all, +compared to how often tokenization and then subsequent concatenation +by the pdfwriter are performed. In fact, versions of pdfrw prior to +0.4 did not even support Unicode for this function. Encoding and +decoding can also easily be performed by the user, outside of the +library, and this might still be recommended, at least for encoding, +if the visual appeal of encodings generated by this module is found +lacking. + + +Decoding strings +~~~~~~~~~~~~~~~~~~~ + +Decoding strings can be tricky, but is a bounded process. Each +properly-encoded encoded string represents exactly one output string, +with the caveat that is up to the caller of the function to know whether +he expects a Unicode string, or just bytes. + +The caller can call PdfString.to_bytes() to get a byte string (which may +or may not represent encoded Unicode), or may call PdfString.to_unicode() +to get a Unicode string. 
Byte strings will be regular strings in Python 2, +and b'' bytes in Python 3; Unicode strings will be regular strings in +Python 3, and u'' unicode strings in Python 2. + +To maintain application compatibility with earlier versions of pdfrw, +PdfString.decode() is an alias for PdfString.to_unicode(). + +Encoding strings +~~~~~~~~~~~~~~~~~~ + +PdfString has three factory functions that will encode strings into +PdfString objects: + + - PdfString.from_bytes() accepts a byte string (regular string in Python 2 + or b'' bytes string in Python 3) and returns a PdfString object. + - PdfString.from_unicode() accepts a Unicode string (u'' Unicode string in + Python 2 or regular string in Python 3) and returns a PdfString object. + - PdfString.encode() examines the type of object passed, and either + calls from_bytes() or from_unicode() to do the real work. + +Unlike decoding(), encoding is not (mathematically) a function. +There are (literally) an infinite number of ways to encode any given +source string. (Of course, most of them would be stupid, unless +the intent is some sort of denial-of-service attack.) + +So encoding strings is either simpler than decoding, or can be made to +be an open-ended science fair project (to create the best looking +encoded strings). + +There are parameters to the encoding functions that allow control over +the final encoded string, but the intention is to make the default values +produce a reasonable encoding. + +As mentioned previously, if encoding does not do what a particular +user needs, that user is free to write his own encoder, and then +simply instantiate a PdfString object by passing a string to the +default constructor, the same way that the tokenizer does it. + +However, if desirable, encoding may gradually become more capable +over time, adding the ability to generate more aesthetically pleasing +encoded strings. + +PDFDocString encoding and decoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To handle this encoding in a fairly standard way, this module registers +an encoder and decoder for PDFDocEncoding with the codecs module. + +""" + +import re +import codecs +import binascii +import itertools +from ..py23_diffs import convert_load, convert_store + +def find_pdfdocencoding(encoding): + """ This function conforms to the codec module registration + protocol. It defers calculating data structures until + a pdfdocencoding encode or decode is required. + + PDFDocEncoding is described in the PDF 1.7 reference manual. 
+ """ + + if encoding != 'pdfdocencoding': + return + + # Create the decoding map based on the table in section D.2 of the + # PDF 1.7 manual + + # Start off with the characters with 1:1 correspondence + decoding_map = set(range(0x20, 0x7F)) | set(range(0xA1, 0x100)) + decoding_map.update((0x09, 0x0A, 0x0D)) + decoding_map.remove(0xAD) + decoding_map = dict((x, x) for x in decoding_map) + + # Add in the special Unicode characters + decoding_map.update(zip(range(0x18, 0x20), ( + 0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC))) + decoding_map.update(zip(range(0x80, 0x9F), ( + 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, + 0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, + 0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, + 0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E))) + decoding_map[0xA0] = 0x20AC + + # Make the encoding map from the decoding map + encoding_map = codecs.make_encoding_map(decoding_map) + + # Not every PDF producer follows the spec, so conform to Postel's law + # and interpret encoded strings if at all possible. In particular, they + # might have nulls and form-feeds, judging by random code snippets + # floating around the internet. + decoding_map.update(((x, x) for x in range(0x18))) + + def encode(input, errors='strict'): + return codecs.charmap_encode(input, errors, encoding_map) + + def decode(input, errors='strict'): + return codecs.charmap_decode(input, errors, decoding_map) + + return codecs.CodecInfo(encode, decode, name='pdfdocencoding') + +codecs.register(find_pdfdocencoding) class PdfString(str): - ''' A PdfString is an encoded string. It has a decode + """ A PdfString is an encoded string. It has a decode method to get the actual string data out, and there is an encode class method to create such a string. Like any PDF object, it could be indirect, but it defaults to being a direct object. - ''' + """ indirect = False - unescape_dict = {'\\b': '\b', '\\f': '\f', '\\n': '\n', - '\\r': '\r', '\\t': '\t', - '\\\r\n': '', '\\\r': '', '\\\n': '', - '\\\\': '\\', '\\': '', - } - unescape_pattern = (r'(\\\\|\\b|\\f|\\n|\\r|\\t' - r'|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)') - unescape_func = re.compile(unescape_pattern).split - hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' - hex_func = re.compile(hex_pattern).split - hex_pattern2 = ('([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|' - '[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])') - hex_func2 = re.compile(hex_pattern2).split + # The byte order mark, and unicode that could be + # wrongly encoded into the byte order mark by the + # pdfdocencoding codec. 
- hex_funcs = hex_func, hex_func2 + bytes_bom = codecs.BOM_UTF16_BE + bad_pdfdoc_prefix = bytes_bom.decode('latin-1') - def decode_regular(self, remap=chr): - assert self[0] == '(' and self[-1] == ')' - mylist = self.unescape_func(self[1:-1]) - result = [] - unescape = self.unescape_dict.get - for chunk in mylist: - chunk = unescape(chunk, chunk) - if chunk.startswith('\\') and len(chunk) > 1: - value = int(chunk[1:], 8) - # FIXME: TODO: Handle unicode here - if value > 127: - value = 127 - chunk = remap(value) - if chunk: - result.append(chunk) - return ''.join(result) + # Used by decode_literal; filled in on first use - def decode_hex(self, remap=chr, twobytes=False): - data = ''.join(self.split()) - data = self.hex_funcs[twobytes](data) - chars = data[1::2] - other = data[0::2] - assert (other[0] == '<' and - other[-1] == '>' and - ''.join(other) == '<>'), self - return ''.join([remap(int(x, 16)) for x in chars]) + unescape_dict = None + unescape_func = None - def decode(self, remap=chr, twobytes=False): - if self.startswith('('): - return self.decode_regular(remap) + @classmethod + def init_unescapes(cls): + """ Sets up the unescape attributes for decode_literal + """ + unescape_pattern = r'\\([0-7]{1,3}|\r\n|.)' + unescape_func = re.compile(unescape_pattern, re.DOTALL).split + cls.unescape_func = unescape_func + + unescape_dict = dict(((chr(x), chr(x)) for x in range(0x100))) + unescape_dict.update(zip('nrtbf', '\n\r\t\b\f')) + unescape_dict['\r'] = '' + unescape_dict['\n'] = '' + unescape_dict['\r\n'] = '' + for i in range(0o10): + unescape_dict['%01o' % i] = chr(i) + for i in range(0o100): + unescape_dict['%02o' % i] = chr(i) + for i in range(0o400): + unescape_dict['%03o' % i] = chr(i) + cls.unescape_dict = unescape_dict + return unescape_func + + def decode_literal(self): + """ Decode a PDF literal string, which is enclosed in parentheses () + + Many pdfrw users never decode strings, so defer creating + data structures to do so until the first string is decoded. + + Possible string escapes from the spec: + (PDF 1.7 Reference, section 3.2.3, page 53) + + 1. \[nrtbf\()]: simple escapes + 2. \\d{1,3}: octal. Must be zero-padded to 3 digits + if followed by digit + 3. \: line continuation. We don't know the EOL + marker used in the PDF, so accept \r, \n, and \r\n. + 4. Any other character following \ escape -- the backslash + is swallowed. + """ + result = (self.unescape_func or self.init_unescapes())(self[1:-1]) + if len(result) == 1: + return convert_store(result[0]) + unescape_dict = self.unescape_dict + result[1::2] = [unescape_dict[x] for x in result[1::2]] + return convert_store(''.join(result)) + + + def decode_hex(self): + """ Decode a PDF hexadecimal-encoded string, which is enclosed + in angle brackets <>. + """ + hexstr = convert_store(''.join(self[1:-1].split())) + if len(hexstr) % 1: # odd number of chars indicates a truncated 0 + hexstr += '0' + return binascii.unhexlify(hexstr) + + + def to_bytes(self): + """ Decode a PDF string to bytes. This is a convenience function + for user code, in that (as of pdfrw 0.3) it is never + actually used inside pdfrw. 
+ """ + if self.startswith('(') and self.endswith(')'): + return self.decode_literal() + + elif self.startswith('<') and self.endswith('>'): + return self.decode_hex() else: - return self.decode_hex(remap, twobytes) + raise ValueError('Invalid PDF string "%s"' % repr(self)) - def encode(cls, source, usehex=False): - assert not usehex, "Not supported yet" - source = source.replace('\\', '\\\\') - source = source.replace('(', '\\(') - source = source.replace(')', '\\)') - return cls('(' + source + ')') - encode = classmethod(encode) + def to_unicode(self): + """ Decode a PDF string to a unicode string. This is a + convenience function for user code, in that (as of + pdfrw 0.3) it is never actually used inside pdfrw. + + There are two Unicode storage methods used -- either + UTF16_BE, or something called PDFDocEncoding, which + is defined in the PDF spec. The determination of + which decoding method to use is done by examining the + first two bytes for the byte order marker. + """ + raw = self.to_bytes() + + if raw[:2] == self.bytes_bom: + return raw[2:].decode('utf-16-be') + else: + return raw.decode('pdfdocencoding') + + # Legacy-compatible interface + decode = to_unicode + + # Internal value used by encoding + + escape_splitter = None # Calculated on first use + + @classmethod + def init_escapes(cls): + """ Initialize the escape_splitter for the encode method + """ + cls.escape_splitter = re.compile(br'(\(|\\|\))').split + return cls.escape_splitter + + @classmethod + def from_bytes(cls, raw, bytes_encoding='auto'): + """ The from_bytes() constructor is called to encode a source raw + byte string into a PdfString that is suitable for inclusion + in a PDF. + + NOTE: There is no magic in the encoding process. A user + can certainly do his own encoding, and simply initialize a + PdfString() instance with his encoded string. That may be + useful, for example, to add line breaks to make it easier + to load PDFs into editors, or to not bother to escape balanced + parentheses, or to escape additional characters to make a PDF + more readable in a file editor. Those are features not + currently supported by this method. + + from_bytes() can use a heuristic to figure out the best + encoding for the string, or the user can control the process + by changing the bytes_encoding parameter to 'literal' or 'hex' + to force a particular conversion method. + """ + + # If hexadecimal is not being forced, then figure out how long + # the escaped literal string will be, and fall back to hex if + # it is too long. + + force_hex = bytes_encoding == 'hex' + if not force_hex: + if bytes_encoding not in ('literal', 'auto'): + raise ValueError('Invalid bytes_encoding value: %s' + % bytes_encoding) + splitlist = (cls.escape_splitter or cls.init_escapes())(raw) + if bytes_encoding == 'auto' and len(splitlist) // 2 >= len(raw): + force_hex = True + + if force_hex: + # The spec does not mandate uppercase, + # but it seems to be the convention. + fmt = '<%s>' + result = binascii.hexlify(raw).upper() + else: + fmt = '(%s)' + splitlist[1::2] = [(b'\\' + x) for x in splitlist[1::2]] + result = b''.join(splitlist) + + return cls(fmt % convert_load(result)) + + @classmethod + def from_unicode(cls, source, text_encoding='auto', + bytes_encoding='auto'): + """ The from_unicode() constructor is called to encode a source + string into a PdfString that is suitable for inclusion in a PDF. + + NOTE: There is no magic in the encoding process. 
A user + can certainly do his own encoding, and simply initialize a + PdfString() instance with his encoded string. That may be + useful, for example, to add line breaks to make it easier + to load PDFs into editors, or to not bother to escape balanced + parentheses, or to escape additional characters to make a PDF + more readable in a file editor. Those are features not + supported by this method. + + from_unicode() can use a heuristic to figure out the best + encoding for the string, or the user can control the process + by changing the text_encoding parameter to 'pdfdocencoding' + or 'utf16', and/or by changing the bytes_encoding parameter + to 'literal' or 'hex' to force particular conversion methods. + + The function will raise an exception if it cannot perform + the conversion as requested by the user. + """ + + # Give preference to pdfdocencoding, since it only + # requires one raw byte per character, rather than two. + if text_encoding != 'utf16': + force_pdfdoc = text_encoding == 'pdfdocencoding' + if text_encoding != 'auto' and not force_pdfdoc: + raise ValueError('Invalid text_encoding value: %s' + % text_encoding) + + if source.startswith(cls.bad_pdfdoc_prefix): + if force_pdfdoc: + raise UnicodeError('Prefix of string %r cannot be encoded ' + 'in pdfdocencoding' % source[:20]) + else: + try: + raw = source.encode('pdfdocencoding') + except UnicodeError: + if force_pdfdoc: + raise + else: + return cls.from_bytes(raw, bytes_encoding) + + # If the user is not forcing literal strings, + # it makes much more sense to use hexadecimal with 2-byte chars + raw = cls.bytes_bom + source.encode('utf-16-be') + encoding = 'hex' if bytes_encoding == 'auto' else bytes_encoding + return cls.from_bytes(raw, encoding) + + @classmethod + def encode(cls, source, uni_type = type(u''), isinstance=isinstance): + """ The encode() constructor is a legacy function that is + also a convenience for the PdfWriter. + """ + if isinstance(source, uni_type): + return cls.from_unicode(source) + else: + return cls.from_bytes(source) diff --git a/pdfrw/pagemerge.py b/pdfrw/pagemerge.py index de1d69b..4555110 100644 --- a/pdfrw/pagemerge.py +++ b/pdfrw/pagemerge.py @@ -176,8 +176,8 @@ class PageMerge(list): return self def render(self): - def do_xobjs(xobj_list): - content = [] + def do_xobjs(xobj_list, restore_first=False): + content = ['Q'] if restore_first else [] for obj in xobj_list: index = PdfName('pdfrw_%d' % (key_offset + len(xobjs))) if xobjs.setdefault(index, obj) is not obj: @@ -199,9 +199,9 @@ class PageMerge(list): allkeys = xobjs.keys() if allkeys: keys = (x for x in allkeys if x.startswith('/pdfrw_')) - keys = (x for x in keys if x[6:].isdigit()) - keys = sorted(keys, key=lambda x: int(x[6:])) - key_offset = (int(keys[-1][6:]) + 1) if keys else 0 + keys = (x for x in keys if x[7:].isdigit()) + keys = sorted(keys, key=lambda x: int(x[7:])) + key_offset = (int(keys[-1][7:]) + 1) if keys else 0 key_offset -= len(allkeys) if old_contents is None: @@ -213,10 +213,18 @@ class PageMerge(list): index = self.index(None) if index: new_contents.append(do_xobjs(self[:index])) - new_contents.extend(old_contents) + index += 1 if index < len(self): - new_contents.append(do_xobjs(self[index:])) + # There are elements to add after the original page contents, + # so push the graphics state to the stack. Restored below. + new_contents.append(PdfDict(indirect=True, stream='q')) + + new_contents.extend(old_contents) + + if index < len(self): + # Restore graphics state and add other elements. 
+ new_contents.append(do_xobjs(self[index:], restore_first=True)) if mbox is None: cbox = None diff --git a/pdfrw/pdfreader.py b/pdfrw/pdfreader.py index 0baf0eb..c2ae030 100644 --- a/pdfrw/pdfreader.py +++ b/pdfrw/pdfreader.py @@ -19,7 +19,8 @@ from .errors import PdfParseError, log from .tokens import PdfTokens from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect from .uncompress import uncompress -from .py23_diffs import convert_load, iteritems +from . import crypt +from .py23_diffs import convert_load, convert_store, iteritems class PdfReader(PdfDict): @@ -265,8 +266,17 @@ class PdfReader(PdfDict): for key in new: self.loadindirect(key) + def decrypt_all(self): + self.read_all() + + if self.crypt_filters is not None: + crypt.decrypt_objects( + self.indirect_objects.values(), self.stream_crypt_filter, + self.crypt_filters) + def uncompress(self): self.read_all() + uncompress(self.indirect_objects.values()) def load_stream_objects(self, object_streams): @@ -279,22 +289,26 @@ class PdfReader(PdfDict): # read objects from stream if objs: + # Decrypt + if self.crypt_filters is not None: + crypt.decrypt_objects( + objs, self.stream_crypt_filter, self.crypt_filters) + + # Decompress uncompress(objs) + for obj in objs: objsource = PdfTokens(obj.stream, 0, False) - snext = objsource.next - offsets = {} + next = objsource.next + offsets = [] firstoffset = int(obj.First) - num = snext() - while num.isdigit(): - offset = int(snext()) - offsets[int(num)] = firstoffset + offset - num = snext() - for num, offset in iteritems(offsets): + while objsource.floc < firstoffset: + offsets.append((int(next()), firstoffset + int(next()))) + for num, offset in offsets: # Read the object, and call special code if it starts # an array or dictionary objsource.floc = offset - sobj = snext() + sobj = next() func = self.special.get(sobj) if func is not None: sobj = func(objsource) @@ -332,7 +346,6 @@ class PdfReader(PdfDict): ''' def readint(s, lengths): - lengths = itertools.cycle(lengths) offset = 0 for length in itertools.cycle(lengths): next = offset + length @@ -354,8 +367,13 @@ class PdfReader(PdfDict): source.exception('Expected dict type of /XRef') tok = next() self.readstream(obj, self.findstream(obj, tok, source), source, True) + old_strm = obj.stream if not uncompress([obj], True): source.exception('Could not decompress Xref stream') + stream = obj.stream + # Fix for issue #76 -- goofy compressed xref stream + # that is NOT ACTUALLY COMPRESSED + stream = stream if stream is not old_strm else convert_store(old_strm) num_pairs = obj.Index or PdfArray(['0', obj.Size]) num_pairs = [int(x) for x in num_pairs] num_pairs = zip(num_pairs[0::2], num_pairs[1::2]) @@ -363,7 +381,7 @@ class PdfReader(PdfDict): if len(entry_sizes) != 3: source.exception('Invalid entry size') object_streams = defaultdict(list) - get = readint(obj.stream, entry_sizes) + get = readint(stream, entry_sizes) for objnum, size in num_pairs: for cnt in range(size): xtype, p1, p2 = islice(get, 3) @@ -431,7 +449,10 @@ class PdfReader(PdfDict): ''' Parse (one of) the cross-reference file section(s) ''' next = source.next - tok = next() + try: + tok = next() + except StopIteration: + tok = '' if tok.isdigit(): return self.parse_xref_stream(source), True elif tok == 'xref': @@ -450,36 +471,92 @@ class PdfReader(PdfDict): typename = PdfName.Type kidname = PdfName.Kids - # PDFs can have arbitrarily nested Pages/Page - # dictionary structures. 
- def readnode(node): - nodetype = node[typename] - if nodetype == pagename: - yield node - elif nodetype == pagesname: - for node in node[kidname]: - for node in readnode(node): - yield node - elif nodetype == catalogname: - for node in readnode(node[pagesname]): - yield node - else: - log.error('Expected /Page or /Pages dictionary, got %s' % - repr(node)) try: - return list(readnode(node)) + result = [] + stack = [node] + append = result.append + pop = stack.pop + while stack: + node = pop() + nodetype = node[typename] + if nodetype == pagename: + append(node) + elif nodetype == pagesname: + stack.extend(reversed(node[kidname])) + elif nodetype == catalogname: + stack.append(node[pagesname]) + else: + log.error('Expected /Page or /Pages dictionary, got %s' % + repr(node)) + return result except (AttributeError, TypeError) as s: log.error('Invalid page tree: %s' % s) return [] - def __init__(self, fname=None, fdata=None, decompress=False, - disable_gc=True, verbose=True): + def _parse_encrypt_info(self, source, password, trailer): + """Check password and initialize crypt filters.""" + # Create and check password key + key = crypt.create_key(password, trailer) + if not crypt.check_user_password(key, trailer): + source.warning('User password does not validate') + + # Create default crypt filters + private = self.private + crypt_filters = self.crypt_filters + version = int(trailer.Encrypt.V or 0) + if version in (1, 2): + crypt_filter = crypt.RC4CryptFilter(key) + private.stream_crypt_filter = crypt_filter + private.string_crypt_filter = crypt_filter + elif version == 4: + if PdfName.CF in trailer.Encrypt: + for name, params in iteritems(trailer.Encrypt.CF): + if name == PdfName.Identity: + continue + + cfm = params.CFM + if cfm == PdfName.AESV2: + crypt_filters[name] = crypt.AESCryptFilter(key) + elif cfm == PdfName.V2: + crypt_filters[name] = crypt.RC4CryptFilter(key) + else: + source.warning( + 'Unsupported crypt filter: {}, {}'.format( + name, cfm)) + + # Read default stream filter + if PdfName.StmF in trailer.Encrypt: + name = trailer.Encrypt.StmF + if name in crypt_filters: + private.stream_crypt_filter = crypt_filters[name] + else: + source.warning( + 'Invalid crypt filter name in /StmF:' + ' {}'.format(name)) + + # Read default string filter + if PdfName.StrF in trailer.Encrypt: + name = trailer.Encrypt.StrF + if name in crypt_filters: + private.string_crypt_filter = crypt_filters[name] + else: + source.warning( + 'Invalid crypt filter name in /StrF:' + ' {}'.format(name)) + else: + source.warning( + 'Unsupported Encrypt version: {}'.format(version)) + + def __init__(self, fname=None, fdata=None, decompress=False, + decrypt=False, password='', disable_gc=True, verbose=True): self.private.verbose = verbose + # Runs a lot faster with GC off. 
disable_gc = disable_gc and gc.isenabled() if disable_gc: gc.disable() + try: if fname is not None: assert fdata is None @@ -494,8 +571,10 @@ class PdfReader(PdfDict): except IOError: raise PdfParseError('Could not read PDF file %s' % fname) - fdata = convert_load(fdata) + assert fdata is not None + fdata = convert_load(fdata) + if not fdata.startswith('%PDF-'): startloc = fdata.find('%PDF-') if startloc >= 0: @@ -548,6 +627,23 @@ class PdfReader(PdfDict): xref_list.append((source.obj_offsets, trailer, is_stream)) source.floc = int(prev) + # Handle document encryption + private.crypt_filters = None + if decrypt and PdfName.Encrypt in trailer: + identity_filter = crypt.IdentityCryptFilter() + crypt_filters = { + PdfName.Identity: identity_filter + } + private.crypt_filters = crypt_filters + private.stream_crypt_filter = identity_filter + private.string_crypt_filter = identity_filter + + if not crypt.HAS_CRYPTO: + raise PdfParseError( + 'Install PyCrypto to enable encryption support') + + self._parse_encrypt_info(source, password, trailer) + if is_stream: self.load_stream_objects(trailer.object_streams) @@ -566,6 +662,10 @@ class PdfReader(PdfDict): float(trailer.Version) > float(self.version)): self.private.version = trailer.Version + if decrypt: + self.decrypt_all() + trailer.Encrypt = None + if is_stream: self.Root = trailer.Root self.Info = trailer.Info diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py index 644bb30..3c887ba 100755 --- a/pdfrw/pdfwriter.py +++ b/pdfrw/pdfwriter.py @@ -29,7 +29,7 @@ NullObject.Type = 'Null object' def user_fmt(obj, isinstance=isinstance, float=float, str=str, - basestring=str, encode=PdfString.encode): + basestring=(type(u''), type(b'')), encode=PdfString.encode): ''' This function may be replaced by the user for specialized formatting requirements. ''' @@ -137,11 +137,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), elif isinstance(obj, PdfDict): if compress and obj.stream: do_compress([obj]) - pairs = sorted((x, y, getattr(x, 'encoded', x)) + pairs = sorted((getattr(x, 'encoded', None) or x, y) for (x, y) in obj.iteritems()) myarray = [] - for key, value, encoding in pairs: - myarray.append(encoding) + for key, value in pairs: + myarray.append(key) myarray.append(add(value)) result = format_array(myarray, '<<%s>>') stream = obj.stream @@ -155,7 +155,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), # We assume that an object with an indirect # attribute knows how to represent itself to us. if hasattr(obj, 'indirect'): - return str(getattr(obj, 'encoded', obj)) + return str(getattr(obj, 'encoded', None) or obj) return user_fmt(obj) def format_deferred(): @@ -177,10 +177,10 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), # Don't reference old catalog or pages objects -- # swap references to new ones. 
- swapobj = {PdfName.Catalog: trailer.Root, + type_remap = {PdfName.Catalog: trailer.Root, PdfName.Pages: trailer.Root.Pages, None: trailer}.get - swapobj = [(objid, swapobj(obj.Type)) - for objid, obj in iteritems(killobj)] + swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj) + for objid, (obj, new_obj) in iteritems(killobj)] swapobj = dict((objid, obj is None and NullObject or obj) for objid, obj in swapobj).get @@ -225,11 +225,44 @@ class PdfWriter(object): _trailer = None canonicalize = False + fname = None - def __init__(self, version='1.3', compress=False): - self.pagearray = PdfArray() - self.compress = compress + def __init__(self, fname=None, version='1.3', compress=False, **kwargs): + """ + Parameters: + fname -- Output file name, or file-like binary object + with a write method + version -- PDF version to target. Currently only 1.3 + supported. + compress -- True to do compression on output. Currently + compresses stream objects. + """ + + # Legacy support: fname is new, was added in front + if fname is not None: + try: + float(fname) + except (ValueError, TypeError): + pass + else: + if version != '1.3': + assert compress == False + compress = version + version = fname + fname = None + + self.fname = fname self.version = version + self.compress = compress + + if kwargs: + for name, value in iteritems(kwargs): + if name not in self.replaceable: + raise ValueError("Cannot set attribute %s " + "on PdfWriter instance" % name) + setattr(self, name, value) + + self.pagearray = PdfArray() self.killobj = {} def addpage(self, page): @@ -251,13 +284,14 @@ class PdfWriter(object): # Add parents in the hierarchy to objects we # don't want to output killobj = self.killobj - obj = page.Parent + obj, new_obj = page, self.pagearray[-1] while obj is not None: objid = id(obj) if objid in killobj: break - killobj[objid] = obj + killobj[objid] = obj, new_obj obj = obj.Parent + new_obj = None return self addPage = addpage # for compatibility with pyPdf @@ -300,10 +334,18 @@ class PdfWriter(object): trailer = property(_get_trailer, _set_trailer) - def write(self, fname, trailer=None, user_fmt=user_fmt, + def write(self, fname=None, trailer=None, user_fmt=user_fmt, disable_gc=True): + trailer = trailer or self.trailer + # Support fname for legacy applications + if (fname is not None) == (self.fname is not None): + raise PdfOutputError( + "PdfWriter fname must be specified exactly once") + + fname = fname or self.fname + # Dump the data. We either have a filename or a preexisting # file object. 
preexisting = hasattr(fname, 'write') @@ -339,3 +381,5 @@ class PdfWriter(object): workitems += obj else: workitems += obj.values() + + replaceable = set(vars()) \ No newline at end of file diff --git a/pdfrw/py23_diffs.py b/pdfrw/py23_diffs.py index a0e0318..b3509d0 100644 --- a/pdfrw/py23_diffs.py +++ b/pdfrw/py23_diffs.py @@ -14,7 +14,9 @@ try: except NameError: def convert_load(s): - return s.decode('Latin-1') + if isinstance(s, bytes): + return s.decode('Latin-1') + return s def convert_store(s): return s.encode('Latin-1') @@ -44,3 +46,8 @@ try: xrange = xrange except NameError: xrange = range + +try: + intern = intern +except NameError: + from sys import intern diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py index 5b061d5..2b69e02 100644 --- a/pdfrw/tokens.py +++ b/pdfrw/tokens.py @@ -15,7 +15,7 @@ import itertools from .objects import PdfString, PdfObject from .objects.pdfname import BasePdfName from .errors import log, PdfParseError -from .py23_diffs import nextattr +from .py23_diffs import nextattr, intern def linepos(fdata, loc): @@ -64,19 +64,7 @@ class PdfTokens(object): findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, whitespace), re.DOTALL).finditer - def _cacheobj(cache, obj, constructor): - ''' This caching relies on the constructors - returning something that will compare as - equal to the original obj. This works - fine with our PDF objects. - ''' - result = cache.get(obj) - if result is None: - result = constructor(obj) - cache[result] = result - return result - - def _gettoks(self, startloc, cacheobj=_cacheobj, + def _gettoks(self, startloc, intern=intern, delimiters=delimiters, findtok=findtok, findparen=findparen, PdfString=PdfString, PdfObject=PdfObject, BasePdfName=BasePdfName): @@ -95,24 +83,23 @@ class PdfTokens(object): fdata = self.fdata current = self.current = [(startloc, startloc)] cache = {} + get_cache = cache.get while 1: for match in findtok(fdata, current[0][1]): current[0] = tokspan = match.span() token = match.group(1) firstch = token[0] + toktype = intern if firstch not in delimiters: - token = cacheobj(cache, token, PdfObject) + toktype = PdfObject elif firstch in '/<(%': if firstch == '/': # PDF Name - encoded = token - token = cache.get(encoded) - if token is None: - token = cache[token] = BasePdfName(encoded) + toktype = BasePdfName elif firstch == '<': # << dict delim, or < hex string > if token[1:2] != '<': - token = cacheobj(cache, token, PdfString) + toktype = PdfString elif firstch == '(': # Literal string # It's probably simple, but maybe not @@ -145,7 +132,7 @@ class PdfTokens(object): loc, ends, nest = ends token = fdata[m_start:loc] + ')' * nest current[0] = m_start, ends - token = cacheobj(cache, token, PdfString) + toktype = PdfString elif firstch == '%': # Comment if self.strip_comments: @@ -154,7 +141,10 @@ class PdfTokens(object): self.exception(('Tokenizer logic incorrect -- ' 'should never get here')) - yield token + newtok = get_cache(token) + if newtok is None: + newtok = cache[token] = toktype(token) + yield newtok if current[0] is not tokspan: break else: @@ -168,6 +158,7 @@ class PdfTokens(object): self.iterator = iterator = self._gettoks(startloc) self.msgs_dumped = None if verbose else set() self.next = getattr(iterator, nextattr) + self.current = [(startloc, startloc)] def setstart(self, startloc): ''' Change the starting location. 
@@ -213,6 +204,8 @@ class PdfTokens(object): msg %= arg fdata = self.fdata begin, end = self.current[0] + if begin >= len(fdata): + return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata)) line, col = linepos(fdata, begin) if end > begin: tok = fdata[begin:end].rstrip() diff --git a/pdfrw/toreportlab.py b/pdfrw/toreportlab.py index 9f77d26..3434fbf 100644 --- a/pdfrw/toreportlab.py +++ b/pdfrw/toreportlab.py @@ -108,7 +108,7 @@ def _makearray(rldoc, pdfobj): def _makestr(rldoc, pdfobj): assert isinstance(pdfobj, (float, int, str)), repr(pdfobj) # TODO: Add fix for float like in pdfwriter - return str(getattr(pdfobj, 'encoded', pdfobj)) + return str(getattr(pdfobj, 'encoded', None) or pdfobj) def makerl_recurse(rldoc, pdfobj): diff --git a/pdfrw/uncompress.py b/pdfrw/uncompress.py index 6780d5d..39e8308 100644 --- a/pdfrw/uncompress.py +++ b/pdfrw/uncompress.py @@ -12,7 +12,7 @@ PNG predictor were originally transcribed from PyPDF2, which is probably an excellent source of additional filters. ''' import array -from .objects import PdfDict, PdfName +from .objects import PdfDict, PdfName, PdfArray from .errors import log from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store @@ -37,7 +37,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(), if isinstance(ftype, list) and len(ftype) == 1: # todo: multiple filters ftype = ftype[0] - parms = obj.DecodeParms + parms = obj.DecodeParms or obj.DP if ftype != flate: msg = ('Not decompressing: cannot use filter %s' ' with parameters %s') % (repr(ftype), repr(parms)) @@ -53,10 +53,18 @@ def uncompress(mylist, leave_raw=False, warnings=set(), error = str(s) else: error = None + if isinstance(parms, PdfArray): + oldparms = parms + parms = PdfDict() + for x in oldparms: + parms.update(x) if parms: predictor = int(parms.Predictor or 1) + columns = int(parms.Columns or 1) + colors = int(parms.Colors or 1) + bpc = int(parms.BitsPerComponent or 8) if 10 <= predictor <= 15: - data, error = flate_png(data, parms) + data, error = flate_png(data, predictor, columns, colors, bpc) elif predictor != 1: error = ('Unsupported flatedecode predictor %s' % repr(predictor)) @@ -74,7 +82,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(), return ok -def flate_png(data, parms): +def flate_png(data, predictor=1, columns=1, colors=1, bpc=8): ''' PNG prediction is used to make certain kinds of data more compressible. Before the compression, each data byte is either left the same, or is set to be a delta @@ -87,9 +95,12 @@ def flate_png(data, parms): this technique for Xref stream objects, which are quite regular. 
''' - columns = int(parms.Columns) + columnbytes = ((columns * colors * bpc) + 7) // 8 data = array.array('B', data) - rowlen = columns + 1 + rowlen = columnbytes + 1 + if predictor == 15: + padding = (rowlen - len(data)) % rowlen + data.extend([0] * padding) assert len(data) % rowlen == 0 rows = xrange(0, len(data), rowlen) for row_index in rows: diff --git a/releasing.txt b/releasing.txt index a108184..b186013 100644 --- a/releasing.txt +++ b/releasing.txt @@ -1,6 +1,6 @@ Notes on releasing, which is not yet fully automated: -1) Update version number both in __init__ and in setup +1) Update version number in pdfrw/__init__.py 2) Use pyroma diff --git a/setup.py b/setup.py index 7d94f95..a18132b 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,8 @@ setup( 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', - + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Multimedia :: Graphics :: Graphics Conversion', 'Topic :: Software Development :: Libraries', 'Topic :: Text Processing', @@ -35,4 +36,5 @@ setup( 'Topic :: Utilities', ], keywords='pdf vector graphics PDF nup watermark split join merge', + zip_safe=True, ) diff --git a/tests/expected.txt b/tests/expected.txt index 64eecdd..b1b7cca 100644 --- a/tests/expected.txt +++ b/tests/expected.txt @@ -11,8 +11,8 @@ examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d -examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 841c980dfadf2cc47ad86e4649ca69b6 -examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 41989bb2cb6225c6e14262ff5d4f151f +examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331 +examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612 # All these are in the poster test @@ -20,10 +20,10 @@ examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86 examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d -examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab 959d6246ad8bda72bd023e8681216d17 -examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 45b4ae29a038271896b7264bbed63bdf -examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 822bce1cb9e053f1f3f6b922bf27fab8 -examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 97ad6a8ca3fe7cc4e1f0ffb8475355e9 +examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198 +examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450 +examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98 +examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5 # List things that need work here (typically cause exceptions) @@ -68,32 +68,33 @@ repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1 
repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8 repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5 -repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 53e5510be27db134edf3cf23873914af +repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598 -repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 019aead1450842406a04c508243e5161 -repaginate/22628a7ed578b622520325673ab2a4f2.pdf 255776a6956918c7b324dede711680ae +repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4 +repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61 -repaginate/295d26e61a85635433f8e4b768953f60.pdf 13ece51f4d2ad25707982765abbcd789 +repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d +repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795 repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22 -repaginate/319c998910453bc44d40c7748cd2cb79.pdf c1a19d1acc3f172711bdbea000cf392e +repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0 repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3 repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0 repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2 -repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf d0b7467d7bd6c7f73b7764b06c0be1aa -repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 6b0ab50c247ca43b70b2b2f27ee2c1a2 -repaginate/5f0cff36d0ad74536a6513a98a755016.pdf b65c2557988db8625c0761bab1d131f1 -repaginate/5f265db2736850782aeaba2571a3c749.pdf 9bb5644ede0ee7cf99642729eda76686 -repaginate/6a42c8c79b807bf164d31071749e07b0.pdf 33a231263e1a4203338b7b1052fc0091 -repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 93419e831e436d9093a153f35d3441c3 +repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45 +repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315 +repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69 +repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c +repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f +repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261 -repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf 6c65526ab372d72cb185933e3d2584ef +repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233 repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273 repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae -repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf b0d1f3925423f9c3ecf4a47baa949f75 +repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83 repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd 
repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840 repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68 @@ -101,6 +102,7 @@ repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273 repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d + simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 @@ -111,6 +113,7 @@ simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097 simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa +simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2 simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4 simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef @@ -124,7 +127,7 @@ simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9 simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05 -simple/707e3e2d17cbe9ec2273414b3b63f333.pdf 4bdf1e57a96ce42717110b4e55098c1a +simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70 simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb @@ -138,3 +141,85 @@ simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124 simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27 + +decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 +decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 +decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b +decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36 +decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 +decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90 +decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637 +decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 +decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b +decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad +decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9 +decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3 +decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f +decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e +decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a +decompress/365b9c95574ee8944370fe286905d0e8.pdf 
9da0100b5844c86e93093d0fbc78b3f6 +decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e +decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc +decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6 +decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58 +decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56 +decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149 +decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 +decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0 +decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158 +decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24 +decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 +decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d +decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb +decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6 +decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 4b53b63b0779b81d8f9569e66ca3d8ee +decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 +decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227 +decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f +decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c +decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a +decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4 +decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380 +decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 +decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0 + +compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa +compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 +compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e +compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3 +compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 +compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976 +compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c +compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6 +compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4 +compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5 +compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24 +compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c +compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30 +compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44 +compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403 +compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15 +compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e +compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba +compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9 
+compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192 +compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20 +compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c +compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 +compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7 +compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7 +compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e +compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594 +compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1 +compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81 +compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd +compress/9f98322c243fe67726d56ccfa8e0885b.pdf 3167fa11a3f1f4a06f90294b21e101b7 +compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 +compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f +compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868 +compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8 +compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24 +compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229 +compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c +compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81 +compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578 diff --git a/tests/test_examples.py b/tests/test_examples.py index baa98a6..6871b80 100755 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -96,7 +96,7 @@ class TestOnePdf(unittest.TestCase): os.remove(scrub) subprocess.call(params) if scrub: - PdfWriter().addpages(PdfReader(scrub).pages).write(dstf) + PdfWriter(dstf).addpages(PdfReader(scrub).pages).write() with open(dstf, 'rb') as f: data = f.read() size = len(data) diff --git a/tests/test_pdfdict.py b/tests/test_pdfdict.py new file mode 100755 index 0000000..cdbe15d --- /dev/null +++ b/tests/test_pdfdict.py @@ -0,0 +1,39 @@ +#! /usr/bin/env python +# encoding: utf-8 +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas +# 2016 James Laird-Wah, Sydney, Australia +# MIT license -- See LICENSE.txt for details + +''' +Run from the directory above like so: +python -m tests.test_pdfstring +''' + + +from pdfrw import PdfDict, PdfName +from pdfrw.objects import PdfIndirect + +import unittest + + +class TestPdfDicts(unittest.TestCase): + + def test_indirect_set_get(self): + io = PdfIndirect((1,2,3)) + io.value = 42 + d = PdfDict() + d.Name = io + test, = (x for x in dict.values(d)) + self.assertEqual(test, io) + v = d['/Name'] + self.assertEqual(v, io.value) + test, = d + self.assertEqual(type(test), type(PdfName.Name)) + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() diff --git a/tests/test_pdfreader_init.py b/tests/test_pdfreader_init.py new file mode 100644 index 0000000..d27d752 --- /dev/null +++ b/tests/test_pdfreader_init.py @@ -0,0 +1,28 @@ +#! 
/usr/bin/env python +import static_pdfs + +from pdfrw import PdfReader + +try: + import unittest2 as unittest +except ImportError: + import unittest + + +class TestPdfReaderInit(unittest.TestCase): + + def test_fname_binary_filelike(self): + with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: + PdfReader(pdf_file) + + def test_fdata_binary(self): + with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file: + pdf_bytes = pdf_file.read() + PdfReader(fdata=pdf_bytes) + + +def main(): + unittest.main() + +if __name__ == '__main__': + main() diff --git a/tests/test_pdfstring.py b/tests/test_pdfstring.py old mode 100644 new mode 100755 index fce47ef..0ea91ad --- a/tests/test_pdfstring.py +++ b/tests/test_pdfstring.py @@ -1,4 +1,9 @@ #! /usr/bin/env python +# encoding: utf-8 +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas +# 2016 James Laird-Wah, Sydney, Australia +# MIT license -- See LICENSE.txt for details ''' Run from the directory above like so: @@ -6,30 +11,106 @@ python -m tests.test_pdfstring ''' -import pdfrw +from pdfrw import PdfString +from pdfrw.py23_diffs import convert_store + import unittest -class TestEncoding(unittest.TestCase): +class TestBaseEncoding(unittest.TestCase): - @staticmethod - def decode(value): - return pdfrw.objects.PdfString(value).decode() + def encode(self, value): + x = PdfString.encode(value) + if isinstance(value, type(u'')): + y = PdfString.from_unicode(value) + else: + y = PdfString.from_bytes(value) + self.assertEqual(x, y) + return x - @staticmethod - def encode(value): - return str(pdfrw.objects.PdfString.encode(value)) + def decode(self, value): + s = PdfString(value) + x = s.to_unicode() + y = s.decode() + self.assertEqual(x, y) + return x - @classmethod - def encode_decode(cls, value): - return cls.decode(cls.encode(value)) + def decode_bytes(self, decode_this, expected): + """ Decode to bytes""" + self.assertEqual(PdfString(decode_this).to_bytes(), + convert_store(expected)) - def roundtrip(self, value): - self.assertEqual(value, self.encode_decode(value)) + def roundtrip(self, value, expected=None): + result = self.encode(value) + self.assertEqual(value, self.decode(result)) + if expected is not None: + self.assertEqual(result, expected) + return result def test_doubleslash(self): self.roundtrip('\\') + self.roundtrip(r'\\') + def test_unicode_encoding(self): + # These chars are in PdfDocEncoding + self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(') + # These chars are not in PdfDocEncoding + self.assertEqual(self.roundtrip(u'δΩσ')[0], '<') + # Check that we're doing a reasonable encoding + # Might want to change this later if we change the definition of reasonable + self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)') + self.roundtrip(u'(\n\u0101', '') + + + def test_constructor(self): + obj = PdfString('hello') + + def test_continuation(self): + # See PDF 1.7 ref section 3.2 page 55 + s1 = PdfString('(These two strings are the same.)') + self.assertEqual(s1.decode(), s1[1:-1]) + s2 = PdfString('(These \\\ntwo strings \\\nare the same.)') + self.assertEqual(s1.decode(), s2.decode()) + s2 = PdfString(s2.replace('\n', '\r')) + self.assertEqual(s1.decode(), s2.decode()) + s2 = PdfString(s2.replace('\r', '\r\n')) + self.assertEqual(s1.decode(), s2.decode()) + + def test_hex_whitespace(self): + # See PDF 1.7 ref section 3.2 page 56 + self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB') + + def test_unicode_escaped_decode(self): + # Some PDF producers happily put unicode strings in 
PdfDocEncoding, + # because the Unicode BOM and \0 are valid code points + decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)') + self.assertEqual(decoded, "hello") + + + def test_unescaping(self): + self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)', + ' ( ) \\ \n \t \f \r \r\n \\n') + + self.decode_bytes(r'(\b\010\10)', '\b\b\b') + self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)', + '\n\n\r\r\t\t\b\b\f\f()\001\023\f3') + self.decode_bytes(r'(\\\nabc)', '\\\nabc') + self.decode_bytes(r'(\ )', ' ') + + def test_BOM_variants(self): + self.roundtrip(u'\ufeff', '') + self.roundtrip(u'\ufffe', '') + self.roundtrip(u'\xfe\xff', '') + self.roundtrip(u'\xff\xfe', '(\xff\xfe)') + self.assertRaises(UnicodeError, PdfString.from_unicode, + u'þÿ blah', text_encoding='pdfdocencoding') + + def test_byte_encode(self): + self.assertEqual(self.encode(b'ABC'), '(ABC)') + + def test_nullstring(self): + self.assertEqual(PdfString('<>').to_bytes(), b'') + self.assertEqual(PdfString('()').to_bytes(), b'') def main(): unittest.main() diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py index cb3645e..a8349a6 100755 --- a/tests/test_roundtrip.py +++ b/tests/test_roundtrip.py @@ -79,11 +79,12 @@ class TestOnePdf(unittest.TestCase): result = 'skip -- encrypt' hash = '------skip-encrypt-no-file------' return self.skipTest('File encrypted') - writer = pdfrw.PdfWriter(compress=compress) + writer = pdfrw.PdfWriter(dstf, compress=compress) if repaginate: writer.addpages(trailer.pages) - trailer = None - writer.write(dstf, trailer) + else: + writer.trailer = trailer + writer.write() with open(dstf, 'rb') as f: data = f.read() size = len(data) @@ -112,15 +113,20 @@ def build_tests(): def test(self): self.roundtrip(*args, **kw) return test - for mytest, repaginate in ( - ('simple', False), - ('repaginate', True) + for mytest, repaginate, decompress, compress in ( + ('simple', False, False, False), + ('repaginate', True, False, False), + ('decompress', False, True, False), + ('compress', False, True, True), ): for srcf in static_pdfs.pdffiles[0]: basename = os.path.basename(srcf) test_name = 'test_%s_%s' % (mytest, basename) test = test_closure(mytest, basename, srcf, - repaginate=repaginate) + repaginate=repaginate, + decompress=decompress, + compress=compress, + ) setattr(TestOnePdf, test_name, test) build_tests() diff --git a/tests/update_expected.py b/tests/update_expected.py new file mode 100755 index 0000000..bed5331 --- /dev/null +++ b/tests/update_expected.py @@ -0,0 +1,84 @@ +#! /usr/bin/env python2 +""" +Put old (good) results in ramdisk/reference, +then generate new (unknown) test results in ramdisk/tmp_results, +THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally: + +run this to update any checksums in expected.txt where both versions +parse to same PDFs. +""" + +import os +import hashlib +from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject + + +def make_canonical(trailer): + ''' Canonicalizes a PDF. Assumes everything + is a Pdf object already. 
+ ''' + visited = set() + workitems = list(trailer.values()) + while workitems: + obj = workitems.pop() + objid = id(obj) + if objid in visited: + continue + visited.add(objid) + obj.indirect = True + if isinstance(obj, (PdfArray, PdfDict)): + if isinstance(obj, PdfArray): + workitems += obj + else: + workitems += obj.values() + return trailer + +with open('expected.txt', 'rb') as f: + expected = f.read() + +def get_digest(fname): + with open(fname, 'rb') as f: + data = f.read() + if data: + return hashlib.md5(data).hexdigest() + +tmp = '_temp.pdf' +count = 0 +goodcount = 0 + +changes = [] +for (srcpath, _, filenames) in os.walk('ramdisk/reference'): + for name in filenames: + if not name.endswith('.pdf'): + continue + src = os.path.join(srcpath, name) + dst = src.replace('/reference/', '/tmp_results/') + if not os.path.exists(dst): + continue + src_digest = get_digest(src) + if not src_digest or src_digest not in expected: + continue + print src + count += 1 + trailer = make_canonical(PdfReader(src)) + out = PdfWriter(tmp) + out.write(trailer=trailer) + match_digest = get_digest(tmp) + if not match_digest: + continue + trailer = make_canonical(PdfReader(dst)) + out = PdfWriter(tmp) + out.write(trailer=trailer) + if get_digest(tmp) != match_digest: + continue + goodcount += 1 + print "OK" + changes.append((src_digest, get_digest(dst))) + +print count, goodcount + +for stuff in changes: + expected = expected.replace(*stuff) + +with open('expected.txt', 'wb') as f: + f.write(expected)
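
The reworked pdfrw/objects/pdfstring.py above replaces the old decode()/encode() pair with an explicit to_unicode()/to_bytes() and from_unicode()/from_bytes() API. The following is a minimal sketch of how that API behaves, based on the docstrings in the patch and the new tests in tests/test_pdfstring.py; the literal sample strings are illustrative only and not part of the patch:

    from pdfrw import PdfString

    # Text that fits in PdfDocEncoding is stored as an escaped literal string.
    s = PdfString.from_unicode(u'Hello PDF')
    assert s.startswith('(')
    assert s.to_unicode() == u'Hello PDF'   # decode() remains as a legacy alias

    # Text outside PdfDocEncoding falls back to UTF-16BE with a BOM,
    # hex-encoded by default.
    s = PdfString.from_unicode(u'\u03b4\u03a9\u03c3')
    assert s.startswith('<')
    assert s.to_unicode() == u'\u03b4\u03a9\u03c3'

    # Raw byte strings go through from_bytes(); encode() dispatches on type.
    assert PdfString.from_bytes(b'ABC') == '(ABC)'
    assert PdfString.encode(b'ABC') == PdfString.from_bytes(b'ABC')
    assert PdfString('(ABC)').to_bytes() == b'ABC'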
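
The pdfreader.py changes add optional decryption: PdfReader grows decrypt and password parameters, builds crypt filters from the /Encrypt dictionary, and applies them via decrypt_all() before clearing the trailer's /Encrypt entry. A hedged sketch of the intended call pattern follows; the file names and password are placeholders, PyCrypto must be installed, and per the changelog this is initial support covering RC4 and AESV2 crypt filters only:

    from pdfrw import PdfReader, PdfWriter

    # 'encrypted.pdf' and 'my-password' are placeholders for this sketch.
    # With decrypt=True, encrypted streams are decrypted after reading and
    # trailer.Encrypt is set to None, so the rewritten file is a plain PDF.
    trailer = PdfReader('encrypted.pdf', decrypt=True, password='my-password')
    PdfWriter('decrypted.pdf').addpages(trailer.pages).write()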
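
pdfwriter.py refactors PdfWriter so the output name can be passed to the constructor and write() called with no arguments, while the legacy write(fname) form keeps working (the name must be supplied exactly once). A short sketch with placeholder file names, mirroring the updated calls in tests/test_examples.py and tests/test_roundtrip.py:

    from pdfrw import PdfReader, PdfWriter

    pages = PdfReader('input.pdf').pages      # 'input.pdf' is a placeholder

    # New-style API: name the output when constructing the writer.
    PdfWriter('output_new.pdf', compress=True).addpages(pages).write()

    # Legacy API: name the output in write() instead.
    writer = PdfWriter(compress=True)
    writer.addpages(pages)
    writer.write('output_legacy.pdf')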