From 3d7941c076e15521147fd418bed88655e9fd990b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Pr=C3=A9vot?= Date: Tue, 29 Dec 2015 03:05:28 +0100 Subject: [PATCH] pdfrw (0.2-2) unstable; urgency=medium MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * QA upload * Don’t run testsuite at build time (closes: #808678): They rely on an external library containing data without any documented copyright # imported from the archive --- .gitignore | 52 ++ .travis.yml | 17 + LICENSE.txt | 62 +++ MANIFEST.in | 2 + README.rst | 745 ++++++++++++++++++++++++++ debian/changelog | 68 +++ debian/compat | 1 + debian/control | 83 +++ debian/copyright | 44 ++ debian/python-pdfrw-doc.examples | 1 + debian/rules | 8 + debian/source/format | 1 + debian/watch | 3 + examples/4up.py | 33 ++ examples/README.txt | 32 ++ examples/alter.py | 24 + examples/booklet.py | 39 ++ examples/cat.py | 35 ++ examples/extract.py | 27 + examples/poster.py | 43 ++ examples/print_two.py | 32 ++ examples/rl1/4up.py | 56 ++ examples/rl1/README.txt | 9 + examples/rl1/booklet.py | 68 +++ examples/rl1/platypus_pdf_template.py | 108 ++++ examples/rl1/subset.py | 42 ++ examples/rl2/README.txt | 5 + examples/rl2/copy.py | 32 ++ examples/rl2/decodegraphics.py | 444 +++++++++++++++ examples/rotate.py | 41 ++ examples/subset.py | 29 + examples/unspread.py | 32 ++ examples/watermark.py | 33 ++ pdfrw/__init__.py | 22 + pdfrw/buildxobj.py | 356 ++++++++++++ pdfrw/compress.py | 27 + pdfrw/errors.py | 38 ++ pdfrw/findobjs.py | 138 +++++ pdfrw/objects/__init__.py | 19 + pdfrw/objects/pdfarray.py | 67 +++ pdfrw/objects/pdfdict.py | 233 ++++++++ pdfrw/objects/pdfindirect.py | 22 + pdfrw/objects/pdfname.py | 80 +++ pdfrw/objects/pdfobject.py | 11 + pdfrw/objects/pdfstring.py | 74 +++ pdfrw/pagemerge.py | 242 +++++++++ pdfrw/pdfreader.py | 591 ++++++++++++++++++++ pdfrw/pdfwriter.py | 341 ++++++++++++ pdfrw/py23_diffs.py | 46 ++ pdfrw/tokens.py | 236 ++++++++ pdfrw/toreportlab.py | 146 
+++++ pdfrw/uncompress.py | 106 ++++ releasing.txt | 10 + setup.cfg | 5 + setup.py | 38 ++ tests/__init__.py | 1 + tests/checkdiffs.py | 81 +++ tests/expected.py | 41 ++ tests/expected.txt | 140 +++++ tests/myprofile.py | 5 + tests/test_examples.py | 195 +++++++ tests/test_pdfstring.py | 39 ++ tests/test_roundtrip.py | 132 +++++ 63 files changed, 5733 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE.txt create mode 100644 MANIFEST.in create mode 100644 README.rst create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/python-pdfrw-doc.examples create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 debian/watch create mode 100755 examples/4up.py create mode 100644 examples/README.txt create mode 100755 examples/alter.py create mode 100755 examples/booklet.py create mode 100755 examples/cat.py create mode 100755 examples/extract.py create mode 100755 examples/poster.py create mode 100755 examples/print_two.py create mode 100755 examples/rl1/4up.py create mode 100644 examples/rl1/README.txt create mode 100755 examples/rl1/booklet.py create mode 100755 examples/rl1/platypus_pdf_template.py create mode 100755 examples/rl1/subset.py create mode 100644 examples/rl2/README.txt create mode 100755 examples/rl2/copy.py create mode 100644 examples/rl2/decodegraphics.py create mode 100755 examples/rotate.py create mode 100755 examples/subset.py create mode 100755 examples/unspread.py create mode 100755 examples/watermark.py create mode 100644 pdfrw/__init__.py create mode 100644 pdfrw/buildxobj.py create mode 100644 pdfrw/compress.py create mode 100644 pdfrw/errors.py create mode 100644 pdfrw/findobjs.py create mode 100644 pdfrw/objects/__init__.py create mode 100644 pdfrw/objects/pdfarray.py create mode 100644 pdfrw/objects/pdfdict.py create mode 100644 pdfrw/objects/pdfindirect.py 
create mode 100644 pdfrw/objects/pdfname.py create mode 100644 pdfrw/objects/pdfobject.py create mode 100644 pdfrw/objects/pdfstring.py create mode 100644 pdfrw/pagemerge.py create mode 100644 pdfrw/pdfreader.py create mode 100755 pdfrw/pdfwriter.py create mode 100644 pdfrw/py23_diffs.py create mode 100644 pdfrw/tokens.py create mode 100644 pdfrw/toreportlab.py create mode 100644 pdfrw/uncompress.py create mode 100644 releasing.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100755 tests/checkdiffs.py create mode 100644 tests/expected.py create mode 100644 tests/expected.txt create mode 100644 tests/myprofile.py create mode 100755 tests/test_examples.py create mode 100644 tests/test_pdfstring.py create mode 100755 tests/test_roundtrip.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6260e55 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# Development artifacts +diffs.txt +examples/*.pdf +examples/rl*/*.pdf +tests/*.pdf +examples/pdfrw +examples/rl*/pdfrw +tests/pdfrw +tests/static_pdfs +tests/ramdisk +tests/saved_results +wiki/ + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# Distribution / packaging +.Python +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +lib64 +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +pyvenv.cfg +pip-selfcheck.json + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Sphinx documentation +docs/_build/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..caa88f5 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,17 @@ +language: python +python: + - "2.6" + - "2.7" + - "3.3" + - "3.4" + - "nightly" +# command to install dependencies +before_install: + - "git clone https://github.com/pmaupin/static_pdfs tests/static_pdfs" +install: + - "pip install ." 
+ - "pip install reportlab || true" + - "pip install zlib || true" + - "pip install unittest2 || true" +# command to run tests +script: "cd tests; /usr/bin/env PYTHONPATH=. py.test" diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..8d3c13d --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,62 @@ +pdfrw (github.com/pmaupin/pdfrw) + +The majority of pdfrw was written by Patrick Maupin and is licensed +under the MIT license (reproduced below). Other contributors include +Attila Tajti and Nerijus Mika. It appears that some of the decompression +code was based on the decompressor from PyPDF2, which was written by +Mathieu Fenniak and licensed under the BSD license (also reproduced below). + +Please add any missing authors here: + +Copyright (c) 2006-2015 Patrick Maupin. All rights reserved. +Copyright (c) 2006 Mathieu Fenniak. All rights reserved. +Copyright (c) 2010 Attila Tajti. All rights reserved. +Copyright (c) 2012 Nerijus Mika. All rights reserved. + + +MIT License: + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + + +BSD License: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f90ac68 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include *.txt *.in *.rst +recursive-include examples *.txt *.py diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..2c91345 --- /dev/null +++ b/README.rst @@ -0,0 +1,745 @@ +============= +pdfrw 0.2b1 +============= + +:Author: Patrick Maupin + +.. contents:: + :backlinks: none + +.. sectnum:: + +Introduction +============ + +**pdfrw** is a Python library and utility that reads and writes PDF files: + +* Version 0.2 is tested and works on Python 2.6, 2.7, 3.3, and 3.4. +* Operations include subsetting, merging, rotating, modifying metadata, etc. +* The fastest pure Python PDF parser available +* Has been used for years by a printer in pre-press production +* Can be used with rst2pdf to faithfully reproduce vector images +* Can be used either standalone, or in conjunction with `reportlab`__ + to reuse existing PDFs in new ones +* Permissively licensed + +__ http://www.reportlab.org/ + + +pdfrw will faithfully reproduce vector formats without +rasterization, so the rst2pdf package has used pdfrw +for PDF and SVG images by default since March 2010. + +pdfrw can also be used in conjunction with reportlab, in order +to re-use portions of existing PDFs in new PDFs created with +reportlab. + + +Examples +========= + +The library comes with several examples that show operation both with +and without reportlab. + + +All examples +------------------ + +The examples directory has a few scripts which use the library. +Note that if these examples do not work with your PDF, you should +try to use pdftk to uncompress and/or unencrypt them first. + +* `4up.py`__ will shrink pages down and place 4 of them on + each output page. +* `alter.py`__ shows an example of modifying metadata, without + altering the structure of the PDF. 
+* `booklet.py`__ shows an example of creating a 2-up output + suitable for printing and folding (e.g. on tabloid size paper). +* `cat.py`__ shows an example of concatenating multiple PDFs together. +* `extract.py`__ will extract images and Form XObjects (embedded pages) + from existing PDFs to make them easier to use and refer to from + new PDFs (e.g. with reportlab or rst2pdf). +* `poster.py`__ increases the size of a PDF so it can be printed + as a poster. +* `print_two.py`__ Allows creation of 8.5 X 5.5" booklets by slicing + 8.5 X 11" paper apart after printing. +* `rotate.py`__ Rotates all or selected pages in a PDF. +* `subset.py`__ Creates a new PDF with only a subset of pages from the + original. +* `unspread.py`__ Takes a 2-up PDF, and splits out pages. +* `watermark.py`__ Adds a watermark PDF image over or under all the pages + of a PDF. +* `rl1/4up.py`__ Another 4up example, using reportlab canvas for output. +* `rl1/booklet.py`__ Another booklet example, using reportlab canvas for + output. +* `rl1/subset.py`__ Another subsetting example, using reportlab canvas for + output. +* `rl1/platypus_pdf_template.py`__ Another watermarking example, using + reportlab canvas and generated output for the document. Contributed + by user asannes. +* `rl2`__ Experimental code for parsing graphics. Needs work. 
+ +__ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/extract.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/poster.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rotate.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/unspread.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/watermark.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/4up.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/ + +Notes on selected examples +------------------------------------ + +Reorganizing pages and placing them two-up +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A printer with a fancy printer and/or a full-up copy of Acrobat can +easily turn your small PDF into a little booklet (for example, print 4 +letter-sized pages on a single 11" x 17"). + +But that assumes several things, including that the personnel know how +to operate the hardware and software. `booklet.py`__ lets you turn your PDF +into a preformatted booklet, to give them fewer chances to mess it up. 
+ +__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py + +Adding or modifying metadata +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `cat.py`__ example will accept multiple input files on the command +line, concatenate them and output them to output.pdf, after adding some +nonsensical metadata to the output PDF file. + +__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py + +The `alter.py`__ example alters a single metadata item in a PDF, +and writes the result to a new PDF. + +__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py + + +One difference is that, since **cat** is creating a new PDF structure, +and **alter** is attempting to modify an existing PDF structure, the +PDF produced by alter (and also by watermark.py) *should* be +more faithful to the original (except for the desired changes). + +For example, the alter.py navigation should be left intact, whereas with +cat.py it will be stripped. + + +Rotating and doubling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you ever want to print something that is like a small booklet, but +needs to be spiral bound, you either have to do some fancy rearranging, +or just waste half your paper. + +The `print_two.py`__ example program will, for example, make two side-by-side +copies of each page of your PDF on each output sheet. + +__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py + +But, every other page is flipped, so that you can print double-sided and +the pages will line up properly and be pre-collated. + +Graphics stream parsing proof of concept +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `copy.py`__ script shows a simple example of reading in a PDF, and +using the decodegraphics.py module to try to write the same information +out to a new PDF through a reportlab canvas. 
(If you know about reportlab, +you know that if you can faithfully render a PDF to a reportlab canvas, you +can do pretty much anything else with that PDF you want.) This kind of +low level manipulation should be done only if you really need to. +decodegraphics is really more of a proof of concept than anything +else. For most cases, just use the Form XObject capability, as shown in +the examples/rl1/booklet.py demo. + +__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/copy.py + +pdfrw philosophy +================== + +Core library +------------- + +The philosophy of the library portion of pdfrw is to provide intuitive +functions to read, manipulate, and write PDF files. There should be +minimal leakage between abstraction layers, although getting useful +work done makes "pure" functionality separation difficult. + +A key concept supported by the library is the use of Form XObjects, +which allow easy embedding of pieces of one PDF into another. + +Addition of core support to the library is typically done carefully +and thoughtfully, so as not to clutter it up with too many special +cases. + +There are a lot of incorrectly formatted PDFs floating around; support +for these is added in some cases. The decision is often based on what +acroread and okular do with the PDFs; if they can display them properly, +then eventually pdfrw should, too, if it is not too difficult or costly. + +Contributions are welcome; one user has contributed some decompression +filters and the ability to process PDF 1.5 stream objects. Additional +functionality that would obviously be useful includes additional +decompression filters, the ability to process password-protected PDFs, +and the ability to output linearized PDFs. + +Examples +-------- + +The philosophy of the examples is to provide small, easily-understood +examples that showcase pdfrw functionality. 
+ + +PDF files and Python +====================== + +Introduction +------------ + +In general, PDF files conceptually map quite well to Python. The major +objects to think about are: + +- **strings**. Most things are strings. These also often decompose + naturally into +- **lists of tokens**. Tokens can be combined to create higher-level + objects like +- **arrays** and +- **dictionaries** and +- **Contents streams** (which can be more streams of tokens) + +Difficulties +------------ + +The apparent primary difficulty in mapping PDF files to Python is the +PDF file concept of "indirect objects." Indirect objects provide +the efficiency of allowing a single piece of data to be referred to +from more than one containing object, but probably more importantly, +indirect objects provide a way to get around the chicken and egg +problem of circular object references when mapping arbitrary data +structures to files. To flatten out a circular reference, an indirect +object is *referred to* instead of being *directly included* in another +object. PDF files have a global mechanism for locating indirect objects, +and they all have two reference numbers (a reference number and a +"generation" number, in case you wanted to append to the PDF file +rather than just rewriting the whole thing). + +pdfrw automatically handles indirect references on reading in a PDF +file. When pdfrw encounters an indirect PDF file object, the +corresponding Python object it creates will have an 'indirect' attribute +with a value of True. When writing a PDF file, if you have created +arbitrary data, you just need to make sure that circular references are +broken up by putting an attribute named 'indirect' which evaluates to +True on at least one object in every cycle. + +Another PDF file concept that doesn't quite map to regular Python is a +"stream". Streams are dictionaries which each have an associated +unformatted data block. 
pdfrw handles streams by placing a special +attribute on a subclassed dictionary. + +Usage Model +----------- + +The usage model for pdfrw treats most objects as strings (it takes their +string representation when writing them to a file). The two main +exceptions are the PdfArray object and the PdfDict object. + +PdfArray is a subclass of list with two special features. First, +an 'indirect' attribute allows a PdfArray to be written out as +an indirect PDF object. Second, pdfrw reads files lazily, so +PdfArray knows about, and resolves references to other indirect +objects on an as-needed basis. + +PdfDict is a subclass of dict that also has an indirect attribute +and lazy reference resolution as well. (And the subclassed +IndirectPdfDict has indirect automatically set True). + +But PdfDict also has an optional associated stream. The stream object +defaults to None, but if you assign a stream to the dict, it will +automatically set the PDF /Length attribute for the dictionary. + +Finally, since PdfDict instances are indexed by PdfName objects (which +always start with a /) and since most (all?) standard Adobe PdfName +objects use names formatted like "/CamelCase", it makes sense to allow +access to dictionary elements via object attribute accesses as well as +object index accesses. So usage of PdfDict objects is normally via +attribute access, although non-standard names (though still with a +leading slash) can be accessed via dictionary index lookup. + +Reading PDFs +~~~~~~~~~~~~~~~ + +The PdfReader object is a subclass of PdfDict, which allows easy access +to an entire document:: + + >>> from pdfrw import PdfReader + >>> x = PdfReader('source.pdf') + >>> x.keys() + ['/Info', '/Size', '/Root'] + >>> x.Info + {'/Producer': '(cairo 1.8.6 (http://cairographics.org))', + '/Creator': '(cairo 1.8.6 (http://cairographics.org))'} + >>> x.Root.keys() + ['/Type', '/Pages'] + +Info, Size, and Root are retrieved from the trailer of the PDF file. 
+ +In addition to the tree structure, pdfrw creates a special attribute +named *pages*, that is a list of all the pages in the document. pdfrw +creates the *pages* attribute as a simplification for the user, because +the PDF format allows arbitrarily complicated nested dictionaries to +describe the page order. Each entry in the *pages* list is the PdfDict +object for one of the pages in the file, in order. + +:: + + >>> len(x.pages) + 1 + >>> x.pages[0] + {'/Parent': {'/Kids': [{...}], '/Type': '/Pages', '/Count': '1'}, + '/Contents': {'/Length': '11260', '/Filter': None}, + '/Resources': ... (Lots more stuff snipped) + >>> x.pages[0].Contents + {'/Length': '11260', '/Filter': None} + >>> x.pages[0].Contents.stream + 'q\n1 1 1 rg /a0 gs\n0 0 0 RG 0.657436 + w\n0 J\n0 j\n[] 0.0 d\n4 M q' ... (Lots more stuff snipped) + +Writing PDFs +~~~~~~~~~~~~~~~ + +As you can see, it is quite easy to dig down into a PDF document. But +what about when it's time to write it out? + +:: + + >>> from pdfrw import PdfWriter + >>> y = PdfWriter() + >>> y.addpage(x.pages[0]) + >>> y.write('result.pdf') + +That's all it takes to create a new PDF. You may still need to read the +`Adobe PDF reference manual`__ to figure out what needs to go *into* +the PDF, but at least you don't have to sweat actually building it +and getting the file offsets right. + +__ http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf + +Manipulating PDFs in memory +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For the most part, pdfrw tries to be agnostic about the contents of +PDF files, and support them as containers, but to do useful work, +something a little higher-level is required, so pdfrw works to +understand a bit about the contents of the containers. For example: + +- PDF pages. pdfrw knows enough to find the pages in PDF files you read + in, and to write a set of pages back out to a new PDF file. +- Form XObjects. 
pdfrw can take any page or rectangle on a page, and + convert it to a Form XObject, suitable for use inside another PDF + file. It knows enough about these to perform scaling, rotation, + and positioning. +- reportlab objects. pdfrw can recursively create a set of reportlab + objects from its internal object format. This allows, for example, + Form XObjects to be used inside reportlab, so that you can reuse + content from an existing PDF file when building a new PDF with + reportlab. + +There are several examples that demonstrate these features in +the example code directory. + +Missing features +~~~~~~~~~~~~~~~~~~~~~~~ + +Even as a pure PDF container library, pdfrw comes up a bit short. It +does not currently support: + +- Most compression/decompression filters +- encryption + +`pdftk`__ is a wonderful command-line +tool that can convert your PDFs to remove encryption and compression. +However, in most cases, you can do a lot of useful work with PDFs +without actually removing compression, because only certain elements +inside PDFs are actually compressed. + +__ https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/ + +Library internals +================== + +Introduction +------------ + +**pdfrw** currently consists of 19 modules organized into a main +package and one sub-package. + +The `__init__.py`__ module does the usual thing of importing a few +major attributes from some of the submodules, and the `errors.py`__ +module supports logging and exception generation. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/__init__.py +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/errors.py + + +PDF object model support +-------------------------- + +The `objects`__ sub-package contains one module for each of the +internal representations of the kinds of basic objects that exist +in a PDF file, with the `objects/__init__.py`__ module in that +package simply gathering them up and making them available to the +main pdfrw package. 
+ +One feature that all the PDF object classes have in common is the +inclusion of an 'indirect' attribute. If 'indirect' exists and evaluates +to True, then when the object is written out, it is written out as an +indirect object. That is to say, it is addressable in the PDF file, and +could be referenced by any number (including zero) of container objects. +This indirect object capability saves space in PDF files by allowing +objects such as fonts to be referenced from multiple pages, and also +allows PDF files to contain internal circular references. This latter +capability is used, for example, when each page object has a "parent" +object in its dictionary. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/ +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/__init__.py + +Ordinary objects +~~~~~~~~~~~~~~~~ + +The `objects/pdfobject.py`__ module contains the PdfObject class, which is +a subclass of str, and is the catch-all object for any PDF file elements +that are not explicitly represented by other objects, as described below. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfobject.py + +Name objects +~~~~~~~~~~~~ + +The `objects/pdfname.py`__ module contains the PdfName singleton object, +which will convert a string into a PDF name by prepending a slash. It can +be used either by calling it or getting an attribute, e.g.:: + + PdfName.Rotate == PdfName('Rotate') == PdfObject('/Rotate') + +In the example above, there is a slight difference between the objects +returned from PdfName, and the object returned from PdfObject. The +PdfName objects are actually objects of class "BasePdfName". This +is important, because only these may be used as keys in PdfDict objects. 
+ +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfname.py + +String objects +~~~~~~~~~~~~~~ + +The `objects/pdfstring.py`__ +module contains the PdfString class, which is a subclass of str that is +used to represent encoded strings in a PDF file. The class has encode +and decode methods for the strings. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfstring.py + + +Array objects +~~~~~~~~~~~~~ + +The `objects/pdfarray.py`__ +module contains the PdfArray class, which is a subclass of list that is +used to represent arrays in a PDF file. A regular list could be used +instead, but use of the PdfArray class allows for an indirect attribute +to be set, and also allows for proxying of unresolved indirect objects +(that haven't been read in yet) in a manner that is transparent to pdfrw +clients. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfarray.py + +Dict objects +~~~~~~~~~~~~ + +The `objects/pdfdict.py`__ +module contains the PdfDict class, which is a subclass of dict that is +used to represent dictionaries in a PDF file. A regular dict could be +used instead, but the PdfDict class matches the requirements of PDF +files more closely: + +* Transparent (from the library client's viewpoint) proxying + of unresolved indirect objects +* Return of None for non-existent keys (like dict.get) +* Mapping of attribute accesses to the dict itself + (pdfdict.Foo == pdfdict[PdfName('Foo')]) +* Automatic management of following stream and /Length attributes + for content dictionaries +* Indirect attribute +* Other attributes may be set for private internal use of the + library and/or its clients. +* Support for searching parent dictionaries for PDF "inheritable" + attributes. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfdict.py + +If a PdfDict has an associated data stream in the PDF file, the stream +is accessed via the 'stream' (all lower-case) attribute. 
Setting the +stream attribute on the PdfDict will automatically set the /Length attribute +as well. If that is not what is desired (for example if the stream +is compressed), then _stream (same name with an underscore) may be used +to associate the stream with the PdfDict without setting the length. + +To set private attributes (that will not be written out to a new PDF +file) on a dictionary, use the 'private' attribute:: + + mydict.private.foo = 1 + +Once the attribute is set, it may be accessed directly as an attribute +of the dictionary:: + + foo = mydict.foo + +Some attributes of PDF pages are "inheritable." That is, they may +belong to a parent dictionary (or a parent of a parent dictionary, etc.) +The "inheritable" attribute allows for easy discovery of these:: + + mediabox = mypage.inheritable.MediaBox + + +Proxy objects +~~~~~~~~~~~~~ + +The `objects/pdfindirect.py`__ +module contains the PdfIndirect class, which is a non-transparent proxy +object for PDF objects that have not yet been read in and resolved from +a file. Although these are non-transparent inside the library, client code +should never see one of these -- they exist inside the PdfArray and PdfDict +container types, but are resolved before being returned to a client of +those types. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfindirect.py + + +File reading, tokenization and parsing +-------------------------------------- + +`pdfreader.py`__ +contains the PdfReader class, which can read a PDF file (or be passed a +file object or already read string) and parse it. It uses the PdfTokens +class in `tokens.py`__ for low-level tokenization. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfreader.py +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/tokens.py + + +The PdfReader class does not, in general, parse into containers (e.g. +inside the content streams). 
There is a proof of concept for doing that +inside the examples/rl2 subdirectory, but that is slow and not well-developed, +and not useful for most applications. + +An instance of the PdfReader class is an instance of a PdfDict -- the +trailer dictionary of the PDF file, to be exact. It will have a private +attribute set on it that is named 'pages' that is a list containing all +the pages in the file. + +When instantiating a PdfReader object, there are options available +for decompressing all the objects in the file. pdfrw does not currently +have very many options for decompression, so this is not all that useful, +except in the specific case of compressed object streams. + +Also, there are no options for decryption yet. If you have PDF files +that are encrypted or heavily compressed, you may find that using another +program like pdftk on them can make them readable by pdfrw. + +In general, the objects are read from the file lazily, but this is not +currently true with compressed object streams -- all of these are decompressed +and read in when the PdfReader is instantiated. + + +File output +----------- + +`pdfwriter.py`__ +contains the PdfWriter class, which can create and output a PDF file. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfwriter.py + +There are a few options available when creating and using this class. + +In the simplest case, an instance of PdfWriter is instantiated, and +then pages are added to it from one or more source files (or created +programmatically), and then the write method is called to dump the +results out to a file. + +If you have a source PDF and do not want to disturb the structure +of it too badly, then you may pass its trailer directly to PdfWriter +rather than letting PdfWriter construct one for you. There is an +example of this (alter.py) in the examples directory. + + +Advanced features +----------------- + +`buildxobj.py`__ +contains functions to build Form XObjects out of pages or rectangles on +pages. 
These may be reused in new PDFs essentially as if they were images. + +buildxobj is careful to cache any page used so that it only appears in +the output once. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/buildxobj.py + + +`toreportlab.py`__ +provides the makerl function, which will translate pdfrw objects into a +format which can be used with `reportlab `__. +It is normally used in conjunction with buildxobj, to be able to reuse +parts of existing PDFs when using reportlab. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/toreportlab.py + + +`pagemerge.py`__ builds on the foundation laid by buildxobj. It +contains classes to create a new page (or overlay an existing page) +using one or more rectangles from other pages. There are examples +showing its use for watermarking, scaling, 4-up output, splitting +each page in 2, etc. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pagemerge.py + +`findobjs.py`__ contains code that can find specific kinds of objects +inside a PDF file. The extract.py example uses this module to create +a new PDF that places each image and Form XObject from a source PDF onto +its own page, e.g. for easy reuse with some of the other examples or +with reportlab. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/findobjs.py + + +Miscellaneous +---------------- + +`compress.py`__ and `uncompress.py`__ +contain compression and decompression functions. Very few filters are +currently supported, so an external tool like pdftk might be good if you +require the ability to decompress (or, for that matter, decrypt) PDF +files. + +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/compress.py +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/uncompress.py + + +`py23_diffs.py`__ contains code to help manage the differences between +Python 2 and Python 3. 
+ +__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/py23_diffs.py + +Testing +=============== + +The tests associated with pdfrw require a large number of PDFs, +which are not distributed with the library. + +To run the tests: + +* Download or clone the full package from github.com/pmaupin/pdfrw +* cd into the tests directory, and then clone the package + github.com/pmaupin/static_pdfs into a subdirectory (also named + static_pdfs). +* Now the tests may be run from that directory using unittest, or + py.test, or nose. +* travisci is used at github, and runs the tests with py.test + +Other libraries +===================== + +Pure Python +----------- + +- `reportlab `__ + + reportlab is must-have software if you want to programmatically + generate arbitrary PDFs. + +- `pyPdf `__ + + pyPdf is, in some ways, very full-featured. It can do decompression + and decryption and seems to know a lot about items inside at least + some kinds of PDF files. In comparison, pdfrw knows less about + specific PDF file features (such as metadata), but focuses on trying + to have a more Pythonic API for mapping the PDF file container + syntax to Python, and (IMO) has a simpler and better PDF file + parser. The Form XObject capability of pdfrw means that, in many + cases, it does not actually need to decompress objects -- they + can be left compressed. + +- `pdftools `__ + + pdftools feels large and I fell asleep trying to figure out how it + all fit together, but many others have done useful things with it. + +- `pagecatcher `__ + + My understanding is that pagecatcher would have done exactly what I + wanted when I built pdfrw. But I was on a zero budget, so I've never + had the pleasure of experiencing pagecatcher. I do, however, use and + like `reportlab `__ (open source, from + the people who make pagecatcher) so I'm sure pagecatcher is great, + better documented and much more full-featured than pdfrw. + +- `pdfminer `__ + + This looks like a useful, actively-developed program. 
It is quite + large, but then, it is trying to actively comprehend a full PDF + document. From the website: + + "PDFMiner is a suite of programs that help extracting and analyzing + text data of PDF documents. Unlike other PDF-related tools, it + allows to obtain the exact location of texts in a page, as well as + other extra information such as font information or ruled lines. It + includes a PDF converter that can transform PDF files into other + text formats (such as HTML). It has an extensible PDF parser that + can be used for other purposes instead of text analysis." + +non-pure-Python libraries +------------------------- + +- `pyPoppler `__ can read PDF + files. +- `pycairo `__ can write PDF + files. + +Other tools +----------- + +- `pdftk `__ is a wonderful command + line tool for basic PDF manipulation. It complements pdfrw extremely + well, supporting many operations such as decryption and decompression + that pdfrw cannot do. + +Release information +======================= + +Revisions: + +0.2 -- Released 21 June, 2015. Supports Python 2.6, 2.7, 3.3, and 3.4. + + - Several bugs have been fixed + - New regression test functionally tests core with dozens of + PDFs, and also tests examples. + - Core has been ported and tested on Python3 by round-tripping + several difficult files and observing binary matching results + across the different Python versions. + - Still only minimal support for compression and no support + for encryption or newer PDF features. (pdftk is useful + to put PDFs in a form that pdfrw can use.) + +0.1 -- Released to PyPI in 2012. 
Supports Python 2.5 - 2.7 + diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..7401c0a --- /dev/null +++ b/debian/changelog @@ -0,0 +1,68 @@ +pdfrw (0.2-2) unstable; urgency=medium + + * QA upload + * Don’t run testsuite at build time (closes: #808678): + They rely on an external library containing data without any documented + copyright + + -- David Prévot Mon, 28 Dec 2015 22:05:28 -0400 + +pdfrw (0.2-1) unstable; urgency=medium + + * new upstream version (closes: #789590) + * add debian/watch + * bump debian/compat to 9 + * upstream changed from code.google.com to github.com + * depend on debhelper >= 9 + * demote python-reportlab to Suggests + * build python3-pdfrw and python-pdfrw-doc following + https://wiki.debian.org/Python/LibraryStyleGuide + * set Vcs-Git and Vcs-Browser to dgit + + -- Johannes Schauer Sat, 10 Oct 2015 01:38:35 +0200 + +pdfrw (0.1-3) unstable; urgency=medium + + * QA upload. + * Build using dh_python2 + + -- Matthias Klose Sun, 13 Jul 2014 15:50:59 +0000 + +pdfrw (0.1-2) unstable; urgency=medium + + * Orphaning package. + + -- Chris Lamb Sun, 09 Feb 2014 00:05:27 +0000 + +pdfrw (0.1-1) unstable; urgency=low + + * New upstream release. + + -- Chris Lamb Tue, 16 Oct 2012 07:54:53 +0100 + +pdfrw (0+svn136-4) unstable; urgency=low + + * Correct Homepage field. (Closes: #683165) + * Specify a 'name' kwarg in call to setuptools.setup. + + -- Chris Lamb Tue, 31 Jul 2012 02:41:14 -0700 + +pdfrw (0+svn136-3) unstable; urgency=low + + * python-pdfrw should Replaces/Provides/Conflicts pdfrw. Thanks to intrigeri + . (Closes: #639273) + + -- Chris Lamb Fri, 26 Aug 2011 10:48:38 +0100 + +pdfrw (0+svn136-2) unstable; urgency=low + + * Rename binary package to "python-pdfrw". + * Change Section to "python". + + -- Chris Lamb Tue, 23 Aug 2011 15:17:20 +0100 + +pdfrw (0+svn136-1) unstable; urgency=low + + * Initial release. 
(Closes: #638862) + + -- Chris Lamb Mon, 22 Aug 2011 16:09:03 +0100 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +9 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..c30e83f --- /dev/null +++ b/debian/control @@ -0,0 +1,83 @@ +Source: pdfrw +Section: python +Priority: optional +Maintainer: Debian QA Group +Build-Depends: debhelper (>= 9), python-setuptools, dh-python, python-all (>= 2.6.6-3~), python-setuptools, python3-all, python3-setuptools +Standards-Version: 3.9.2 +Homepage: https://github.com/pmaupin/pdfrw +Vcs-Git: https://git.dgit.debian.org/botch +Vcs-Browser: https://browse.dgit.debian.org/botch.git/ +X-Python-Version: >= 2.6 +X-Python3-Version: >= 3.2 + +Package: python-pdfrw +Architecture: all +Depends: ${misc:Depends}, ${python:Depends} +Suggests: python-reportlab, python-pdfrw-doc +Replaces: pdfrw +Provides: pdfrw +Conflicts: pdfrw +Description: PDF file manipulation library (Python 2) + pdfrw can read and write PDF files, and can also be used to read in PDFs which + can then be used inside reportlab. + . + pdfrw tries to be agnostic about the contents of PDF files, and support them + as containers, but to do useful work, something a little higher-level is + required. It supports the following: + . + * PDF pages. pdfrw knows enough to find the pages in PDF files you read in, + and to write a set of pages back out to a new PDF file. + * Form XObjects. pdfrw can take any page or rectangle on a page, and convert + it to a Form XObject, suitable for use inside another PDF file + * reportlab objects. pdfrw can recursively create a set of reportlab objects + from its internal object format. This allows, for example, Form XObjects to + be used inside reportlab. + . + This package installs the library for Python 2. 
+ +Package: python3-pdfrw +Architecture: all +Depends: ${misc:Depends}, ${python3:Depends} +Suggests: python-reportlab, python-pdfrw-doc +Replaces: pdfrw +Provides: pdfrw +Conflicts: pdfrw +Description: PDF file manipulation library (Python 3) + pdfrw can read and write PDF files, and can also be used to read in PDFs which + can then be used inside reportlab. + . + pdfrw tries to be agnostic about the contents of PDF files, and support them + as containers, but to do useful work, something a little higher-level is + required. It supports the following: + . + * PDF pages. pdfrw knows enough to find the pages in PDF files you read in, + and to write a set of pages back out to a new PDF file. + * Form XObjects. pdfrw can take any page or rectangle on a page, and convert + it to a Form XObject, suitable for use inside another PDF file + * reportlab objects. pdfrw can recursively create a set of reportlab objects + from its internal object format. This allows, for example, Form XObjects to + be used inside reportlab. + . + This package installs the library for Python 3. + +Package: python-pdfrw-doc +Architecture: all +Depends: ${misc:Depends} +Section: doc +Description: PDF file manipulation library (documentation) + pdfrw can read and write PDF files, and can also be used to read in PDFs which + can then be used inside reportlab. + . + pdfrw tries to be agnostic about the contents of PDF files, and support them + as containers, but to do useful work, something a little higher-level is + required. It supports the following: + . + * PDF pages. pdfrw knows enough to find the pages in PDF files you read in, + and to write a set of pages back out to a new PDF file. + * Form XObjects. pdfrw can take any page or rectangle on a page, and convert + it to a Form XObject, suitable for use inside another PDF file + * reportlab objects. pdfrw can recursively create a set of reportlab objects + from its internal object format. 
This allows, for example, Form XObjects to + be used inside reportlab. + . + This is the common documentation package. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..018216a --- /dev/null +++ b/debian/copyright @@ -0,0 +1,44 @@ +Author: Patrick Maupin +Download: https://github.com/pmaupin/pdfrw + +Files: * +Copyright: © 2006-2009 Patrick Maupin +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +Files: debian/* +Copyright: © 2011 Chris Lamb +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . 
+ The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. diff --git a/debian/python-pdfrw-doc.examples b/debian/python-pdfrw-doc.examples new file mode 100644 index 0000000..e39721e --- /dev/null +++ b/debian/python-pdfrw-doc.examples @@ -0,0 +1 @@ +examples/* diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..ba73a3a --- /dev/null +++ b/debian/rules @@ -0,0 +1,8 @@ +#!/usr/bin/make -f + +export PYBUILD_NAME = pdfrw + +%: + dh $@ --with python2,python3 --buildsystem=pybuild + +override_dh_auto_test: diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000..5995029 --- /dev/null +++ b/debian/watch @@ -0,0 +1,3 @@ +version=3 +opts="filenamemangle=s/(?:.*?)?v?(\d[\d.]*)\.tar\.gz/pdfrw-$1.tar.gz/" \ + https://github.com/pmaupin/pdfrw/tags (?:.*?/)?v?(\d[\d.]*)\.tar\.gz diff --git a/examples/4up.py b/examples/4up.py new file mode 100755 index 0000000..ad2bd3b --- /dev/null +++ b/examples/4up.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +''' +usage: 4up.py my.pdf + +Creates 4up.my.pdf with a single output page for every +4 input pages. 
+''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + + +def get4(srcpages): + scale = 0.5 + srcpages = PageMerge() + srcpages + x_increment, y_increment = (scale * i for i in srcpages.xobj_box[2:]) + for i, page in enumerate(srcpages): + page.scale(scale) + page.x = x_increment if i & 1 else 0 + page.y = 0 if i & 2 else y_increment + return srcpages.render() + + +inpfn, = sys.argv[1:] +outfn = '4up.' + os.path.basename(inpfn) +pages = PdfReader(inpfn).pages +writer = PdfWriter() +for index in range(0, len(pages), 4): + writer.addpage(get4(pages[index:index + 4])) +writer.write(outfn) diff --git a/examples/README.txt b/examples/README.txt new file mode 100644 index 0000000..242f5be --- /dev/null +++ b/examples/README.txt @@ -0,0 +1,32 @@ +Example programs: + +4up.py -- Prints pages four-up + +alter.py -- Simple example of making a very slight modification to a PDF. + +booklet.py -- Converts a PDF into a booklet. + +metadata.py -- Concatenates multiple PDFs, adds metadata. + +poster.py -- Changes the size of a PDF to create a poster + +print_two.py -- this is used when printing two cut-down copies on a single sheet of paper (double-sided) Requires uncompressed PDF. + +rotate.py -- This will rotate selected ranges of pages within a document. + +subset.py -- This will retrieve a subset of pages from a document. + +watermark.py -- Adds a watermark to a PDF + +rl1/4up.py -- Same as 4up.py, using reportlab for output. Next simplest reportlab example. + +rl1/booklet.py -- Version of print_booklet using reportlab for output. + +rl1/platypus_pdf_template.py -- Example using a PDF page as a watermark background with reportlab. + +rl1/subset.py -- Same as subset.py, using reportlab for output. Simplest reportlab example. + +rl2/copy.py -- example of how you could parse a graphics stream and then use reportlab for output. 
+ Works on a few different PDFs, probably not a suitable starting point for real + production work without a lot of work on the library functions. + diff --git a/examples/alter.py b/examples/alter.py new file mode 100755 index 0000000..45b9c76 --- /dev/null +++ b/examples/alter.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +''' +usage: alter.py my.pdf + +Creates alter.my.pdf + +Demonstrates making a slight alteration to a preexisting PDF file. + +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter + +inpfn, = sys.argv[1:] +outfn = 'alter.' + os.path.basename(inpfn) + +trailer = PdfReader(inpfn) +trailer.Info.Title = 'My New Title Goes Here' +writer = PdfWriter() +writer.trailer = trailer +writer.write(outfn) diff --git a/examples/booklet.py b/examples/booklet.py new file mode 100755 index 0000000..4758b08 --- /dev/null +++ b/examples/booklet.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +''' +usage: booklet.py my.pdf + +Creates booklet.my.pdf + +Pages organized in a form suitable for booklet printing, e.g. +to print 4 8.5x11 pages using a single 11x17 sheet (double-sided). +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + + +def fixpage(*pages): + result = PageMerge() + (x for x in pages if x is not None) + result[-1].x += result[0].w + return result.render() + + +inpfn, = sys.argv[1:] +outfn = 'booklet.' + os.path.basename(inpfn) +ipages = PdfReader(inpfn).pages + +# Make sure we have an even number +if len(ipages) & 1: + ipages.append(None) + +opages = [] +while len(ipages) > 2: + opages.append(fixpage(ipages.pop(), ipages.pop(0))) + opages.append(fixpage(ipages.pop(0), ipages.pop())) + +opages += ipages + +PdfWriter().addpages(opages).write(outfn) diff --git a/examples/cat.py b/examples/cat.py new file mode 100755 index 0000000..86cf643 --- /dev/null +++ b/examples/cat.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +''' +usage: cat.py [ ...] + +Creates cat. 
+ +This file demonstrates two features: + +1) Concatenating multiple input PDFs. + +2) adding metadata to the PDF. + +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, IndirectPdfDict + +inputs = sys.argv[1:] +assert inputs +outfn = 'cat.' + os.path.basename(inputs[0]) + +writer = PdfWriter() +for inpfn in inputs: + writer.addpages(PdfReader(inpfn).pages) + +writer.trailer.Info = IndirectPdfDict( + Title='your title goes here', + Author='your name goes here', + Subject='what is it all about?', + Creator='some script goes here', +) +writer.write(outfn) diff --git a/examples/extract.py b/examples/extract.py new file mode 100755 index 0000000..3756b4f --- /dev/null +++ b/examples/extract.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python + +''' +usage: extract.py + +Locates Form XObjects and Image XObjects within the PDF, +and creates a new PDF containing these -- one per page. + +Resulting file will be named extract. + +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter +from pdfrw.findobjs import page_per_xobj + + +inpfn, = sys.argv[1:] +outfn = 'extract.' + os.path.basename(inpfn) +pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72)) +if not pages: + raise IndexError("No XObjects found") +writer = PdfWriter() +writer.addpages(pages) +writer.write(outfn) diff --git a/examples/poster.py b/examples/poster.py new file mode 100755 index 0000000..7f1c1c2 --- /dev/null +++ b/examples/poster.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +''' +usage: poster.py my.pdf + +Shows how to change the size on a PDF. + +Motivation: + +My daughter needed to create a 48" x 36" poster, but her Mac +version of Powerpoint only wanted to output 8.5" x 11" for +some reason. + +So she did an 8.5x11" output with 0.5" margin all around +(actual size of useful area 7.5x10") and we scaled it +up by 4.8. + +We also copy the Info dict to the new PDF. 
+ +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict + + +def adjust(page, margin=36, scale=4.8): + info = PageMerge().add(page) + x1, y1, x2, y2 = info.xobj_box + viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin) + page = PageMerge().add(page, viewrect=viewrect) + page[0].scale(scale) + return page.render() + + +inpfn, = sys.argv[1:] +outfn = 'poster.' + os.path.basename(inpfn) +reader = PdfReader(inpfn) +writer = PdfWriter() +writer.addpage(adjust(reader.pages[0])) +writer.trailer.Info = IndirectPdfDict(reader.Info or {}) +writer.write(outfn) diff --git a/examples/print_two.py b/examples/print_two.py new file mode 100755 index 0000000..c54eaee --- /dev/null +++ b/examples/print_two.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +''' +usage: print_two.py my.pdf + +Creates print_two.my.pdf + +This is only useful when you can cut down sheets of paper to make two +small documents. Works for double-sided only right now. +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + + +def fixpage(page, count=[0]): + count[0] += 1 + oddpage = (count[0] & 1) + + result = PageMerge() + for rotation in (180 + 180 * oddpage, 180 * oddpage): + result.add(page, rotate=rotation) + result[1].x = result[0].w + return result.render() + + +inpfn, = sys.argv[1:] +outfn = 'print_two.' + os.path.basename(inpfn) +pages = PdfReader(inpfn).pages +PdfWriter().addpages(fixpage(x) for x in pages).write(outfn) diff --git a/examples/rl1/4up.py b/examples/rl1/4up.py new file mode 100755 index 0000000..49f766e --- /dev/null +++ b/examples/rl1/4up.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +''' +usage: 4up.py my.pdf + + +Uses Form XObjects and reportlab to create 4up.my.pdf. + +Demonstrates use of pdfrw with reportlab. 
+ +''' + +import sys +import os + +from reportlab.pdfgen.canvas import Canvas + +from pdfrw import PdfReader +from pdfrw.buildxobj import pagexobj +from pdfrw.toreportlab import makerl + + +def addpage(canvas, allpages): + pages = allpages[:4] + del allpages[:4] + + x_max = max(page.BBox[2] for page in pages) + y_max = max(page.BBox[3] for page in pages) + + canvas.setPageSize((x_max, y_max)) + + for index, page in enumerate(pages): + x = x_max * (index & 1) / 2.0 + y = y_max * (index <= 1) / 2.0 + canvas.saveState() + canvas.translate(x, y) + canvas.scale(0.5, 0.5) + canvas.doForm(makerl(canvas, page)) + canvas.restoreState() + canvas.showPage() + + +def go(argv): + inpfn, = argv + outfn = '4up.' + os.path.basename(inpfn) + + pages = PdfReader(inpfn).pages + pages = [pagexobj(x) for x in pages] + canvas = Canvas(outfn) + + while pages: + addpage(canvas, pages) + canvas.save() + +if __name__ == '__main__': + go(sys.argv[1:]) diff --git a/examples/rl1/README.txt b/examples/rl1/README.txt new file mode 100644 index 0000000..11cb541 --- /dev/null +++ b/examples/rl1/README.txt @@ -0,0 +1,9 @@ +This directory contains example scripts which read in PDFs +and convert pages to PDF Form XObjects using pdfrw, and then +write out the PDFs using reportlab. + +The examples, from easiest to hardest, are: + +subset.py -- prints a subset of pages +4up.py -- prints pages 4-up +booklet.py -- creates a booklet out of the pages diff --git a/examples/rl1/booklet.py b/examples/rl1/booklet.py new file mode 100755 index 0000000..e7764a0 --- /dev/null +++ b/examples/rl1/booklet.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +''' +usage: booklet.py my.pdf + + +Uses Form XObjects and reportlab to create booklet.my.pdf. + +Demonstrates use of pdfrw with reportlab. 
+ +''' + +import sys +import os + +from reportlab.pdfgen.canvas import Canvas + +from pdfrw import PdfReader +from pdfrw.buildxobj import pagexobj +from pdfrw.toreportlab import makerl + + +def read_and_double(inpfn): + pages = PdfReader(inpfn).pages + pages = [pagexobj(x) for x in pages] + if len(pages) & 1: + pages.append(pages[0]) # Sentinel -- get same size for back as front + + xobjs = [] + while len(pages) > 2: + xobjs.append((pages.pop(), pages.pop(0))) + xobjs.append((pages.pop(0), pages.pop())) + xobjs += [(x,) for x in pages] + return xobjs + + +def make_pdf(outfn, xobjpairs): + canvas = Canvas(outfn) + for xobjlist in xobjpairs: + x = y = 0 + for xobj in xobjlist: + x += xobj.BBox[2] + y = max(y, xobj.BBox[3]) + + canvas.setPageSize((x, y)) + + # Handle blank back page + if len(xobjlist) > 1 and xobjlist[0] == xobjlist[-1]: + xobjlist = xobjlist[:1] + x = xobjlist[0].BBox[2] + else: + x = 0 + y = 0 + + for xobj in xobjlist: + canvas.saveState() + canvas.translate(x, y) + canvas.doForm(makerl(canvas, xobj)) + canvas.restoreState() + x += xobj.BBox[2] + canvas.showPage() + canvas.save() + + +inpfn, = sys.argv[1:] +outfn = 'booklet.' + os.path.basename(inpfn) + +make_pdf(outfn, read_and_double(inpfn)) diff --git a/examples/rl1/platypus_pdf_template.py b/examples/rl1/platypus_pdf_template.py new file mode 100755 index 0000000..7e4769a --- /dev/null +++ b/examples/rl1/platypus_pdf_template.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +usage: platypus_pdf_template.py source.pdf + +Creates platypus.source.pdf + +Example of using pdfrw to use page 1 of a source PDF as the background +for other pages programmatically generated with Platypus. 
+ +Contributed by user asannes + +""" +import sys +import os + +from reportlab.platypus import PageTemplate, BaseDocTemplate, Frame +from reportlab.platypus import NextPageTemplate, Paragraph, PageBreak +from reportlab.platypus.tableofcontents import TableOfContents +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.rl_config import defaultPageSize +from reportlab.lib.units import inch +from reportlab.graphics import renderPDF + +from pdfrw import PdfReader +from pdfrw.buildxobj import pagexobj +from pdfrw.toreportlab import makerl + +PAGE_WIDTH = defaultPageSize[0] +PAGE_HEIGHT = defaultPageSize[1] + + +class MyTemplate(PageTemplate): + """The kernel of this example, where we use pdfrw to fill in the + background of a page before writing to it. This could be used to fill + in a water mark or similar.""" + + def __init__(self, pdf_template_filename, name=None): + frames = [Frame( + 0.85 * inch, + 0.5 * inch, + PAGE_WIDTH - 1.15 * inch, + PAGE_HEIGHT - (1.5 * inch) + )] + PageTemplate.__init__(self, name, frames) + # use first page as template + page = PdfReader(pdf_template_filename).pages[0] + self.page_template = pagexobj(page) + # Scale it to fill the complete page + self.page_xscale = PAGE_WIDTH/self.page_template.BBox[2] + self.page_yscale = PAGE_HEIGHT/self.page_template.BBox[3] + + def beforeDrawPage(self, canvas, doc): + """Draws the background before anything else""" + canvas.saveState() + rl_obj = makerl(canvas, self.page_template) + canvas.scale(self.page_xscale, self.page_yscale) + canvas.doForm(rl_obj) + canvas.restoreState() + + +class MyDocTemplate(BaseDocTemplate): + """Used to apply heading to table of contents.""" + + def afterFlowable(self, flowable): + """Adds Heading1 to table of contents""" + if flowable.__class__.__name__ == 'Paragraph': + style = flowable.style.name + text = flowable.getPlainText() + key = '%s' % self.seq.nextf('toc') + if style == 'Heading1': + self.canv.bookmarkPage(key) + 
self.notify('TOCEntry', [1, text, self.page, key]) + + +def create_toc(): + """Creates the table of contents""" + table_of_contents = TableOfContents() + table_of_contents.dotsMinLevel = 0 + header1 = ParagraphStyle(name='Heading1', fontSize=16, leading=16) + header2 = ParagraphStyle(name='Heading2', fontSize=14, leading=14) + table_of_contents.levelStyles = [header1, header2] + return [table_of_contents, PageBreak()] + + +def create_pdf(filename, pdf_template_filename): + """Create the pdf, with all the contents""" + pdf_report = open(filename, "wb") + document = MyDocTemplate(pdf_report) + templates = [MyTemplate(pdf_template_filename, name='background')] + document.addPageTemplates(templates) + + styles = getSampleStyleSheet() + elements = [NextPageTemplate('background')] + elements.extend(create_toc()) + + # Dummy content (hello world x 200) + for i in range(200): + elements.append(Paragraph("Hello World" + str(i), styles['Heading1'])) + + document.multiBuild(elements) + pdf_report.close() + + +if __name__ == '__main__': + template, = sys.argv[1:] + output = 'platypus_pdf_template.' + os.path.basename(template) + create_pdf(output, template) diff --git a/examples/rl1/subset.py b/examples/rl1/subset.py new file mode 100755 index 0000000..c05056c --- /dev/null +++ b/examples/rl1/subset.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +''' +usage: subset.py my.pdf firstpage lastpage + +Creates subset__to_.my.pdf + + +Uses Form XObjects and reportlab to create output file. + +Demonstrates use of pdfrw with reportlab. + +''' + +import sys +import os + +from reportlab.pdfgen.canvas import Canvas + +from pdfrw import PdfReader +from pdfrw.buildxobj import pagexobj +from pdfrw.toreportlab import makerl + + +def go(inpfn, firstpage, lastpage): + firstpage, lastpage = int(firstpage), int(lastpage) + outfn = 'subset.' 
+ os.path.basename(inpfn) + + pages = PdfReader(inpfn).pages + pages = [pagexobj(x) for x in pages[firstpage - 1:lastpage]] + canvas = Canvas(outfn) + + for page in pages: + canvas.setPageSize((page.BBox[2], page.BBox[3])) + canvas.doForm(makerl(canvas, page)) + canvas.showPage() + + canvas.save() + +if __name__ == '__main__': + inpfn, firstpage, lastpage = sys.argv[1:] + go(inpfn, firstpage, lastpage) diff --git a/examples/rl2/README.txt b/examples/rl2/README.txt new file mode 100644 index 0000000..6d3b590 --- /dev/null +++ b/examples/rl2/README.txt @@ -0,0 +1,5 @@ +The copy.py demo in this directory parses the graphics stream from the PDF and actually plays it back through reportlab. + +Doesn't yet handle fonts or unicode very well. + +For a more practical demo, look at the Form XObjects approach in the examples/rl1 directory. diff --git a/examples/rl2/copy.py b/examples/rl2/copy.py new file mode 100755 index 0000000..66fe0c6 --- /dev/null +++ b/examples/rl2/copy.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +''' +usage: copy.py my.pdf + +Creates copy.my.pdf + +Uses somewhat-functional parser. For better results +for most things, see the Form XObject-based method. + +''' + +import sys +import os + +from reportlab.pdfgen.canvas import Canvas + +from decodegraphics import parsepage +from pdfrw import PdfReader, PdfWriter, PdfArray + +inpfn, = sys.argv[1:] +outfn = 'copy.' 
+ os.path.basename(inpfn) +pages = PdfReader(inpfn, decompress=True).pages +canvas = Canvas(outfn, pageCompression=0) + +for page in pages: + box = [float(x) for x in page.MediaBox] + assert box[0] == box[1] == 0, "demo won't work on this PDF" + canvas.setPageSize(box[2:]) + parsepage(page, canvas) + canvas.showPage() +canvas.save() diff --git a/examples/rl2/decodegraphics.py b/examples/rl2/decodegraphics.py new file mode 100644 index 0000000..e2f3a9f --- /dev/null +++ b/examples/rl2/decodegraphics.py @@ -0,0 +1,444 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +This file is an example parser that will parse a graphics stream +into a reportlab canvas. + +Needs work on fonts and unicode, but works on a few PDFs. + +Better to use Form XObjects for most things (see the example in rl1). + +''' +from inspect import getargspec + +from pdfrw import PdfTokens +from pdfrw.objects import PdfString + +############################################################################# +# Graphics parsing + + +def parse_array(self, token='[', params=None): + mylist = [] + for token in self.tokens: + if token == ']': + break + mylist.append(token) + self.params.append(mylist) + + +def parse_savestate(self, token='q', params=''): + self.canv.saveState() + + +def parse_restorestate(self, token='Q', params=''): + self.canv.restoreState() + + +def parse_transform(self, token='cm', params='ffffff'): + self.canv.transform(*params) + + +def parse_linewidth(self, token='w', params='f'): + self.canv.setLineWidth(*params) + + +def parse_linecap(self, token='J', params='i'): + self.canv.setLineCap(*params) + + +def parse_linejoin(self, token='j', params='i'): + self.canv.setLineJoin(*params) + + +def parse_miterlimit(self, token='M', params='f'): + self.canv.setMiterLimit(*params) + + +def parse_dash(self, token='d', params='as'): # Array, string + self.canv.setDash(*params) + 
+ +def parse_intent(self, token='ri', params='n'): + # TODO: add logging + pass + + +def parse_flatness(self, token='i', params='i'): + # TODO: add logging + pass + + +def parse_gstate(self, token='gs', params='n'): + # TODO: add logging + # Could parse stuff we care about from here later + pass + + +def parse_move(self, token='m', params='ff'): + if self.gpath is None: + self.gpath = self.canv.beginPath() + self.gpath.moveTo(*params) + self.current_point = params + + +def parse_line(self, token='l', params='ff'): + self.gpath.lineTo(*params) + self.current_point = params + + +def parse_curve(self, token='c', params='ffffff'): + self.gpath.curveTo(*params) + self.current_point = params[-2:] + + +def parse_curve1(self, token='v', params='ffff'): + parse_curve(self, token, tuple(self.current_point) + tuple(params)) + + +def parse_curve2(self, token='y', params='ffff'): + parse_curve(self, token, tuple(params) + tuple(params[-2:])) + + +def parse_close(self, token='h', params=''): + self.gpath.close() + + +def parse_rect(self, token='re', params='ffff'): + if self.gpath is None: + self.gpath = self.canv.beginPath() + self.gpath.rect(*params) + self.current_point = params[-2:] + + +def parse_stroke(self, token='S', params=''): + finish_path(self, 1, 0, 0) + + +def parse_close_stroke(self, token='s', params=''): + self.gpath.close() + finish_path(self, 1, 0, 0) + + +def parse_fill(self, token='f', params=''): + finish_path(self, 0, 1, 1) + + +def parse_fill_compat(self, token='F', params=''): + finish_path(self, 0, 1, 1) + + +def parse_fill_even_odd(self, token='f*', params=''): + finish_path(self, 0, 1, 0) + + +def parse_fill_stroke_even_odd(self, token='B*', params=''): + finish_path(self, 1, 1, 0) + + +def parse_fill_stroke(self, token='B', params=''): + finish_path(self, 1, 1, 1) + + +def parse_close_fill_stroke_even_odd(self, token='b*', params=''): + self.gpath.close() + finish_path(self, 1, 1, 0) + + +def parse_close_fill_stroke(self, token='b', params=''): + 
self.gpath.close() + finish_path(self, 1, 1, 1) + + +def parse_nop(self, token='n', params=''): + finish_path(self, 0, 0, 0) + + +def finish_path(self, stroke, fill, fillmode): + if self.gpath is not None: + canv = self.canv + canv._fillMode, oldmode = fillmode, canv._fillMode + canv.drawPath(self.gpath, stroke, fill) + canv._fillMode = oldmode + self.gpath = None + + +def parse_clip_path(self, token='W', params=''): + # TODO: add logging + pass + + +def parse_clip_path_even_odd(self, token='W*', params=''): + # TODO: add logging + pass + + +def parse_stroke_gray(self, token='G', params='f'): + self.canv.setStrokeGray(*params) + + +def parse_fill_gray(self, token='g', params='f'): + self.canv.setFillGray(*params) + + +def parse_stroke_rgb(self, token='RG', params='fff'): + self.canv.setStrokeColorRGB(*params) + + +def parse_fill_rgb(self, token='rg', params='fff'): + self.canv.setFillColorRGB(*params) + + +def parse_stroke_cmyk(self, token='K', params='ffff'): + self.canv.setStrokeColorCMYK(*params) + + +def parse_fill_cmyk(self, token='k', params='ffff'): + self.canv.setFillColorCMYK(*params) + +############################################################################# +# Text parsing + + +def parse_begin_text(self, token='BT', params=''): + assert self.tpath is None + self.tpath = self.canv.beginText() + + +def parse_text_transform(self, token='Tm', params='ffffff'): + path = self.tpath + + # Stoopid optimization to remove nop + try: + code = path._code + except AttributeError: + pass + else: + if code[-1] == '1 0 0 1 0 0 Tm': + code.pop() + + path.setTextTransform(*params) + + +def parse_setfont(self, token='Tf', params='nf'): + fontinfo = self.fontdict[params[0]] + self.tpath._setFont(fontinfo.name, params[1]) + self.curfont = fontinfo + + +def parse_text_out(self, token='Tj', params='t'): + text = params[0].decode(self.curfont.remap, self.curfont.twobyte) + self.tpath.textOut(text) + + +def parse_TJ(self, token='TJ', params='a'): + remap = 
self.curfont.remap + twobyte = self.curfont.twobyte + result = [] + for x in params[0]: + if isinstance(x, PdfString): + result.append(x.decode(remap, twobyte)) + else: + # TODO: Adjust spacing between characters here + int(x) + text = ''.join(result) + self.tpath.textOut(text) + + +def parse_end_text(self, token='ET', params=''): + assert self.tpath is not None + self.canv.drawText(self.tpath) + self.tpath = None + + +def parse_move_cursor(self, token='Td', params='ff'): + self.tpath.moveCursor(params[0], -params[1]) + + +def parse_set_leading(self, token='TL', params='f'): + self.tpath.setLeading(*params) + + +def parse_text_line(self, token='T*', params=''): + self.tpath.textLine() + + +def parse_set_char_space(self, token='Tc', params='f'): + self.tpath.setCharSpace(*params) + + +def parse_set_word_space(self, token='Tw', params='f'): + self.tpath.setWordSpace(*params) + + +def parse_set_hscale(self, token='Tz', params='f'): + self.tpath.setHorizScale(params[0] - 100) + + +def parse_set_rise(self, token='Ts', params='f'): + self.tpath.setRise(*params) + + +def parse_xobject(self, token='Do', params='n'): + # TODO: Need to do this + pass + + +class FontInfo(object): + ''' Pretty basic -- needs a lot of work to work right for all fonts + ''' + lookup = { + # WRONG -- have to learn about font stuff... 
+ 'BitstreamVeraSans': 'Helvetica', + } + + def __init__(self, source): + name = source.BaseFont[1:] + self.name = self.lookup.get(name, name) + self.remap = chr + self.twobyte = False + info = source.ToUnicode + if not info: + return + info = info.stream.split('beginbfchar')[1].split('endbfchar')[0] + info = list(PdfTokens(info)) + assert not len(info) & 1 + info2 = [] + for x in info: + assert x[0] == '<' and x[-1] == '>' and len(x) in (4, 6), x + i = int(x[1:-1], 16) + info2.append(i) + self.remap = dict((x, chr(y)) for (x, y) in + zip(info2[::2], info2[1::2])).get + self.twobyte = len(info[0]) > 4 + +############################################################################# +# Control structures + + +def findparsefuncs(): + + def checkname(n): + assert n.startswith('/') + return n + + def checkarray(a): + assert isinstance(a, list), a + return a + + def checktext(t): + assert isinstance(t, PdfString) + return t + + fixparam = dict(f=float, i=int, n=checkname, a=checkarray, + s=str, t=checktext) + fixcache = {} + + def fixlist(params): + try: + result = fixcache[params] + except KeyError: + result = tuple(fixparam[x] for x in params) + fixcache[params] = result + return result + + dispatch = {} + expected_args = 'self token params'.split() + for key, func in globals().items(): + if key.startswith('parse_'): + args, varargs, keywords, defaults = getargspec(func) + assert (args == expected_args and varargs is None and + keywords is None and len(defaults) == 2), ( + key, args, varargs, keywords, defaults) + token, params = defaults + if params is not None: + params = fixlist(params) + value = func, params + assert dispatch.setdefault(token, value) is value, repr(token) + return dispatch + + +class _ParseClass(object): + dispatch = findparsefuncs() + + @classmethod + def parsepage(cls, page, canvas=None): + self = cls() + contents = page.Contents + if contents.Filter is not None: + raise SystemExit('Cannot parse graphics -- page encoded with %s' + % 
contents.Filter) + dispatch = cls.dispatch.get + self.tokens = tokens = iter(PdfTokens(contents.stream)) + self.params = params = [] + self.canv = canvas + self.gpath = None + self.tpath = None + self.fontdict = dict((x, FontInfo(y)) for + (x, y) in page.Resources.Font.iteritems()) + + for token in self.tokens: + info = dispatch(token) + if info is None: + params.append(token) + continue + func, paraminfo = info + if paraminfo is None: + func(self, token, ()) + continue + delta = len(params) - len(paraminfo) + if delta: + if delta < 0: + print ('Operator %s expected %s parameters, got %s' % + (token, len(paraminfo), params)) + params[:] = [] + continue + else: + print ("Unparsed parameters/commands: %s" % params[:delta]) + del params[:delta] + paraminfo = zip(paraminfo, params) + try: + params[:] = [x(y) for (x, y) in paraminfo] + except: + for i, (x, y) in enumerate(paraminfo): + try: + x(y) + except: + raise # For now + continue + func(self, token, params) + params[:] = [] + + +def debugparser(undisturbed=set('parse_array'.split())): + def debugdispatch(): + def getvalue(oldval): + name = oldval[0].__name__ + + def myfunc(self, token, params): + print ('%s called %s(%s)' % (token, name, + ', '.join(str(x) for x in params))) + if name in undisturbed: + myfunc = oldval[0] + return myfunc, oldval[1] + return dict((x, getvalue(y)) + for (x, y) in _ParseClass.dispatch.iteritems()) + + class _DebugParse(_ParseClass): + dispatch = debugdispatch() + + return _DebugParse.parsepage + +parsepage = _ParseClass.parsepage + +if __name__ == '__main__': + import sys + from pdfreader import PdfReader + parse = debugparser() + fname, = sys.argv[1:] + pdf = PdfReader(fname) + for i, page in enumerate(pdf.pages): + print ('\nPage %s ------------------------------------' % i) + parse(page) diff --git a/examples/rotate.py b/examples/rotate.py new file mode 100755 index 0000000..8b10d05 --- /dev/null +++ b/examples/rotate.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +''' +usage: 
rotate.py my.pdf rotation [page[range] ...] + eg. rotate.py 270 1-3 5 7-9 + + Rotation must be multiple of 90 degrees, clockwise. + +Creates rotate.my.pdf with selected pages rotated. Rotates all by default. + +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter + +inpfn = sys.argv[1] +rotate = sys.argv[2] +ranges = sys.argv[3:] + +rotate = int(rotate) +assert rotate % 90 == 0 + +ranges = [[int(y) for y in x.split('-')] for x in ranges] +outfn = 'rotate.%s' % os.path.basename(inpfn) +trailer = PdfReader(inpfn) +pages = trailer.pages + +if not ranges: + ranges = [[1, len(pages)]] + +for onerange in ranges: + onerange = (onerange + onerange[-1:])[:2] + for pagenum in range(onerange[0]-1, onerange[1]): + pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or + 0) + rotate) % 360 + +outdata = PdfWriter() +outdata.trailer = trailer +outdata.write(outfn) diff --git a/examples/subset.py b/examples/subset.py new file mode 100755 index 0000000..30a577a --- /dev/null +++ b/examples/subset.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +''' +usage: subset.py my.pdf page[range] [page[range]] ... + eg. subset.py 1-3 5 7-9 + +Creates subset.my.pdf + +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter + +inpfn = sys.argv[1] +ranges = sys.argv[2:] +assert ranges, "Expected at least one range" + +ranges = ([int(y) for y in x.split('-')] for x in ranges) +outfn = 'subset.%s' % os.path.basename(inpfn) +pages = PdfReader(inpfn).pages +outdata = PdfWriter() + +for onerange in ranges: + onerange = (onerange + onerange[-1:])[:2] + for pagenum in range(onerange[0], onerange[1]+1): + outdata.addpage(pages[pagenum-1]) +outdata.write(outfn) diff --git a/examples/unspread.py b/examples/unspread.py new file mode 100755 index 0000000..4b3bc5d --- /dev/null +++ b/examples/unspread.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +''' +usage: unspread.py my.pdf + +Creates unspread.my.pdf + +Chops each page in half, e.g. 
if a source were +created in booklet form, you could extract individual +pages. +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + + +def splitpage(src): + ''' Split a page into two (left and right) + ''' + # Yield a result for each half of the page + for x_pos in (0, 0.5): + yield PageMerge().add(src, viewrect=(x_pos, 0, 0.5, 1)).render() + + +inpfn, = sys.argv[1:] +outfn = 'unspread.' + os.path.basename(inpfn) +writer = PdfWriter() +for page in PdfReader(inpfn).pages: + writer.addpages(splitpage(page)) +writer.write(outfn) diff --git a/examples/watermark.py b/examples/watermark.py new file mode 100755 index 0000000..96b686b --- /dev/null +++ b/examples/watermark.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +''' +Simple example of watermarking using form xobjects (pdfrw). + +usage: watermark.py [-u] my.pdf single_page.pdf + +Creates watermark.my.pdf, with every page overlaid with +first page from single_page.pdf. If -u is selected, watermark +will be placed underneath page (painted first). + +NB: At one point, this example was extremely complicated, with + multiple options. That only led to errors in implementation, + so it has been re-simplified in order to show basic principles + of the library operation and to match the other examples better. +''' + +import sys +import os + +from pdfrw import PdfReader, PdfWriter, PageMerge + +argv = sys.argv[1:] +underneath = '-u' in argv +if underneath: + del argv[argv.index('-u')] +inpfn, wmarkfn = argv +outfn = 'watermark.' 
+ os.path.basename(inpfn) +wmark = PageMerge().add(PdfReader(wmarkfn).pages[0])[0] +trailer = PdfReader(inpfn) +for page in trailer.pages: + PageMerge(page).add(wmark, prepend=underneath).render() +PdfWriter().write(outfn, trailer) diff --git a/pdfrw/__init__.py b/pdfrw/__init__.py new file mode 100644 index 0000000..a36a8cb --- /dev/null +++ b/pdfrw/__init__.py @@ -0,0 +1,22 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +from .pdfwriter import PdfWriter +from .pdfreader import PdfReader +from .objects import (PdfObject, PdfName, PdfArray, + PdfDict, IndirectPdfDict, PdfString) +from .tokens import PdfTokens +from .errors import PdfParseError +from .pagemerge import PageMerge + +__version__ = '0.2' + +# Add a tiny bit of compatibility to pyPdf + +PdfFileReader = PdfReader +PdfFileWriter = PdfWriter + +__all__ = [PdfWriter, PdfReader, PdfObject, PdfName, PdfArray, + PdfTokens, PdfParseError, PdfDict, IndirectPdfDict, + PdfString, PageMerge] diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py new file mode 100644 index 0000000..d210c67 --- /dev/null +++ b/pdfrw/buildxobj.py @@ -0,0 +1,356 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' + +This module contains code to build PDF "Form XObjects". + +A Form XObject allows a fragment from one PDF file to be cleanly +included in another PDF file. 
+ +Reference for syntax: "Parameters for opening PDF files" from SDK 8.1 + + http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf + + supported 'page=xxx', 'viewrect=,,,' + + Also supported by this, but not by Adobe: + 'rotate=xxx' where xxx in [0, 90, 180, 270] + + Units are in points + + +Reference for content: Adobe PDF reference, sixth edition, version 1.7 + + http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf + + Form xobjects discussed chapter 4.9, page 355 +''' + +from .objects import PdfDict, PdfArray, PdfName +from .pdfreader import PdfReader +from .errors import log, PdfNotImplementedError +from .py23_diffs import iteritems + + +class ViewInfo(object): + ''' Instantiate ViewInfo with a uri, and it will parse out + the filename, page, and viewrect into object attributes. + + Note 1: + Viewrects follow the adobe definition. (See reference + above). They are arrays of 4 numbers: + + - Distance from left of document in points + - Distance from top (NOT bottom) of document in points + - Width of rectangle in points + - Height of rectangle in points + + Note 2: + For simplicity, Viewrects can also be specified + in fractions of the document. If every number in + the viewrect is between 0 and 1 inclusive, then + viewrect elements 0 and 2 are multiplied by the + mediabox width before use, and viewrect elements + 1 and 3 are multiplied by the mediabox height before + use. + + Note 3: + By default, an XObject based on the view will be + cacheable. It should not be cacheable if the XObject + will be subsequently modified. 
+ ''' + doc = None + docname = None + page = None + viewrect = None + rotate = None + cacheable = True + + def __init__(self, pageinfo='', **kw): + pageinfo = pageinfo.split('#', 1) + if len(pageinfo) == 2: + pageinfo[1:] = pageinfo[1].replace('&', '#').split('#') + for key in 'page viewrect'.split(): + if pageinfo[0].startswith(key + '='): + break + else: + self.docname = pageinfo.pop(0) + for item in pageinfo: + key, value = item.split('=') + key = key.strip() + value = value.replace(',', ' ').split() + if key in ('page', 'rotate'): + assert len(value) == 1 + setattr(self, key, int(value[0])) + elif key == 'viewrect': + assert len(value) == 4 + setattr(self, key, [float(x) for x in value]) + else: + log.error('Unknown option: %s', key) + for key, value in iteritems(kw): + assert hasattr(self, key), key + setattr(self, key, value) + + +def get_rotation(rotate): + ''' Return clockwise rotation code: + 0 = unrotated + 1 = 90 degrees + 2 = 180 degrees + 3 = 270 degrees + ''' + try: + rotate = int(rotate) + except (ValueError, TypeError): + return 0 + if rotate % 90 != 0: + return 0 + return rotate // 90 + + +def rotate_point(point, rotation): + ''' Rotate an (x,y) coordinate clockwise by a + rotation code specifying a multiple of 90 degrees. + ''' + if rotation & 1: + point = point[1], -point[0] + if rotation & 2: + point = -point[0], -point[1] + return point + + +def rotate_rect(rect, rotation): + ''' Rotate both points within the rectangle, then normalize + the rectangle by returning the new lower left, then new + upper right. + ''' + rect = rotate_point(rect[:2], rotation) + rotate_point(rect[2:], rotation) + return (min(rect[0], rect[2]), min(rect[1], rect[3]), + max(rect[0], rect[2]), max(rect[1], rect[3])) + + +def getrects(inheritable, pageinfo, rotation): + ''' Given the inheritable attributes of a page and + the desired pageinfo rectangle, return the page's + media box and the calculated boundary (clip) box. 
+ ''' + mbox = tuple([float(x) for x in inheritable.MediaBox]) + cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)]) + vrect = pageinfo.viewrect + if vrect is not None: + # Rotate the media box to match what the user sees, + # figure out the clipping box, then rotate back + mleft, mbot, mright, mtop = rotate_rect(cbox, rotation) + x, y, w, h = vrect + + # Support operations in fractions of a page + if 0 <= min(vrect) < max(vrect) <= 1: + mw = mright - mleft + mh = mtop - mbot + x *= mw + w *= mw + y *= mh + h *= mh + + cleft = mleft + x + ctop = mtop - y + cright = cleft + w + cbot = ctop - h + cbox = (max(mleft, cleft), max(mbot, cbot), + min(mright, cright), min(mtop, ctop)) + cbox = rotate_rect(cbox, -rotation) + return mbox, cbox + + +def _build_cache(contents, allow_compressed): + ''' Build a new dictionary holding the stream, + and save it along with private cache info. + Assumes validity has been pre-checked if + we have a non-None xobj_copy. + ''' + try: + xobj_copy = contents.xobj_copy + except AttributeError: + # Should have a PdfArray here... + array = contents + private = contents + else: + # Should have a PdfDict here -- might or might not have cache copy + if xobj_copy is not None: + return xobj_copy + array = [contents] + private = contents.private + + # The spec says nothing about nested arrays. Will + # assume that's not a problem until we encounter them... + + xobj_copy = PdfDict(array[0]) + xobj_copy.private.xobj_cachedict = {} + private.xobj_copy = xobj_copy + + if len(array) > 1: + newstream = '\n'.join(x.stream for x in array) + newlength = sum(int(x.Length) for x in array) + len(array) - 1 + assert newlength == len(newstream) + xobj_copy.stream = newstream + + # Cannot currently cope with different kinds of + # compression in the array, so just disallow it. 
+ allow_compressed = False + + if not allow_compressed: + # Make sure there are no compression parameters + for cdict in array: + keys = [x[0] for x in iteritems(cdict)] + if len(keys) != 1: + raise PdfNotImplementedError( + 'Xobjects with compression parameters not supported: %s' % + keys) + return xobj_copy + + +def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True): + ''' Return a cached Form XObject, or create a new one and cache it. + Adds private members x, y, w, h + ''' + cachedict = contents.xobj_cachedict + cachekey = mbox, bbox, rotation + result = cachedict.get(cachekey) if cacheable else None + if result is None: + # If we are not getting a full page, or if we are going to + # modify the results, first retrieve an underlying Form XObject + # that represents the entire page, so that we are not copying + # the full page data into the new file multiple times + func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable] + result = PdfDict( + func(contents, resources, mbox), + Type=PdfName.XObject, + Subtype=PdfName.Form, + FormType=1, + BBox=PdfArray(bbox), + ) + rect = bbox + if rotation: + matrix = (rotate_point((1, 0), rotation) + + rotate_point((0, 1), rotation)) + result.Matrix = PdfArray(matrix + (0, 0)) + rect = rotate_rect(rect, rotation) + + private = result.private + private.x = rect[0] + private.y = rect[1] + private.w = rect[2] - rect[0] + private.h = rect[3] - rect[1] + if cacheable: + cachedict[cachekey] = result + return result + + +def _get_fullpage(contents, resources, mbox): + ''' fullpage is easy. Just copy the contents, + set up the resources, and let _cache_xobj handle the + rest. 
+ ''' + return PdfDict(contents, Resources=resources) + + +def _get_subpage(contents, resources, mbox): + ''' subpages *could* be as easy as full pages, but we + choose to complicate life by creating a Form XObject + for the page, and then one that references it for + the subpage, on the off-chance that we want multiple + items from the page. + ''' + return PdfDict( + stream='/FullPage Do\n', + Resources=PdfDict( + XObject=PdfDict( + FullPage=_cache_xobj(contents, resources, mbox, mbox, 0) + ) + ) + ) + + +def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True): + ''' pagexobj creates and returns a Form XObject for + a given view within a page (Defaults to entire page.) + + pagexobj is passed a page and a viewrect. + ''' + inheritable = page.inheritable + resources = inheritable.Resources + rotation = get_rotation(inheritable.Rotate) + mbox, bbox = getrects(inheritable, viewinfo, rotation) + rotation += get_rotation(viewinfo.rotate) + contents = _build_cache(page.Contents, allow_compressed) + return _cache_xobj(contents, resources, mbox, bbox, rotation, + viewinfo.cacheable) + + +def docxobj(pageinfo, doc=None, allow_compressed=True): + ''' docinfo reads a page out of a document and uses + pagexobj to create the Form XObject based on + the page. + + This is a convenience function for things like + rst2pdf that want to be able to pass in textual + filename/location descriptors and don't want to + know about using PdfReader. + + Can work standalone, or in conjunction with + the CacheXObj class (below). + + ''' + if not isinstance(pageinfo, ViewInfo): + pageinfo = ViewInfo(pageinfo) + + # If we're explicitly passed a document, + # make sure we don't have one implicitly as well. + # If no implicit or explicit doc, then read one in + # from the filename. 
+ if doc is not None: + assert pageinfo.doc is None + pageinfo.doc = doc + elif pageinfo.doc is not None: + doc = pageinfo.doc + else: + doc = pageinfo.doc = PdfReader(pageinfo.docname, + decompress=not allow_compressed) + assert isinstance(doc, PdfReader) + + sourcepage = doc.pages[(pageinfo.page or 1) - 1] + return pagexobj(sourcepage, pageinfo, allow_compressed) + + +class CacheXObj(object): + ''' Use to keep from reparsing files over and over, + and to keep from making the output too much + bigger than it ought to be by replicating + unnecessary object copies. + + This is a convenience function for things like + rst2pdf that want to be able to pass in textual + filename/location descriptors and don't want to + know about using PdfReader. + ''' + def __init__(self, decompress=False): + ''' Set decompress true if you need + the Form XObjects to be decompressed. + Will decompress what it can and scream + about the rest. + ''' + self.cached_pdfs = {} + self.decompress = decompress + + def load(self, sourcename): + ''' Load a Form XObject from a uri + ''' + info = ViewInfo(sourcename) + fname = info.docname + pcache = self.cached_pdfs + doc = pcache.get(fname) + if doc is None: + doc = pcache[fname] = PdfReader(fname, decompress=self.decompress) + return docxobj(info, doc, allow_compressed=not self.decompress) diff --git a/pdfrw/compress.py b/pdfrw/compress.py new file mode 100644 index 0000000..0479131 --- /dev/null +++ b/pdfrw/compress.py @@ -0,0 +1,27 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +Currently, this sad little file only knows how to decompress +using the flate (zlib) algorithm. Maybe more later, but it's +not a priority for me... 
+''' + +from .objects import PdfName +from .uncompress import streamobjects +from .py23_diffs import zlib + + +def compress(mylist): + flate = PdfName.FlateDecode + for obj in streamobjects(mylist): + ftype = obj.Filter + if ftype is not None: + continue + oldstr = obj.stream + newstr = zlib.compress(oldstr) + if len(newstr) < len(oldstr) + 30: + obj.stream = newstr + obj.Filter = flate + obj.DecodeParms = None diff --git a/pdfrw/errors.py b/pdfrw/errors.py new file mode 100644 index 0000000..263cd4d --- /dev/null +++ b/pdfrw/errors.py @@ -0,0 +1,38 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +PDF Exceptions and error handling +''' + +import logging + + +logging.basicConfig( + format='[%(levelname)s] %(filename)s:%(lineno)d %(message)s', + level=logging.WARNING) + +log = logging.getLogger('pdfrw') + + +class PdfError(Exception): + "Abstract base class of exceptions thrown by this module" + + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return self.msg + + +class PdfParseError(PdfError): + "Error thrown by parser/tokenizer" + + +class PdfOutputError(PdfError): + "Error thrown by PDF writer" + + +class PdfNotImplementedError(PdfError): + "Error thrown on missing features" diff --git a/pdfrw/findobjs.py b/pdfrw/findobjs.py new file mode 100644 index 0000000..f19ebdf --- /dev/null +++ b/pdfrw/findobjs.py @@ -0,0 +1,138 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' This module contains a function to find all the XObjects + in a document, and another function that will wrap them + in page objects. 
+''' + +from .objects import PdfDict, PdfArray, PdfName +from .pdfwriter import user_fmt + + +def find_objects(source, valid_types=(PdfName.XObject, None), + valid_subtypes=(PdfName.Form, PdfName.Image), + no_follow=(PdfName.Parent,), + isinstance=isinstance, id=id, sorted=sorted, + reversed=reversed, PdfDict=PdfDict): + ''' + Find all the objects of a particular kind in a document + or array. Defaults to looking for Form and Image XObjects. + + This could be done recursively, but some PDFs + are quite deeply nested, so we do it without + recursion. + + Note that we don't know exactly where things appear on pages, + but we aim for a sort order that is (a) mostly in document order, + and (b) reproducible. For arrays, objects are processed in + array order, and for dicts, they are processed in key order. + ''' + container = (PdfDict, PdfArray) + + # Allow passing a list of pages, or a dict + if isinstance(source, PdfDict): + source = [source] + else: + source = list(source) + + visited = set() + source.reverse() + while source: + obj = source.pop() + if not isinstance(obj, container): + continue + myid = id(obj) + if myid in visited: + continue + visited.add(myid) + if isinstance(obj, PdfDict): + if obj.Type in valid_types and obj.Subtype in valid_subtypes: + yield obj + obj = [y for (x, y) in sorted(obj.iteritems()) + if x not in no_follow] + else: + # TODO: This forces resolution of any indirect objects in + # the array. It may not be necessary. Don't know if + # reversed() does any voodoo underneath the hood. + # It's cheap enough for now, but might be removeable. + obj and obj[0] + source.extend(reversed(obj)) + + +def wrap_object(obj, width, margin): + ''' Wrap an xobj in its own page object. 
+ ''' + fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q' + contents = PdfDict(indirect=True) + subtype = obj.Subtype + if subtype == PdfName.Form: + contents._stream = obj.stream + contents.Length = obj.Length + contents.Filter = obj.Filter + contents.DecodeParms = obj.DecodeParms + resources = obj.Resources + mbox = obj.BBox + elif subtype == PdfName.Image: # Image + xoffset = margin[0] + yoffset = margin[1] + cw = width - margin[0] - margin[2] + iw, ih = float(obj.Width), float(obj.Height) + ch = 1.0 * cw / iw * ih + height = ch + margin[1] + margin[3] + p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset)) + contents.stream = fmt % p + resources = PdfDict(XObject=PdfDict(MyImage=obj)) + mbox = PdfArray((0, 0, width, height)) + else: + raise TypeError("Expected Form or Image XObject") + + return PdfDict( + indirect=True, + Type=PdfName.Page, + MediaBox=mbox, + Resources=resources, + Contents=contents, + ) + + +def trivial_xobjs(maxignore=300): + ''' Ignore XObjects that trivially contain other XObjects. + ''' + ignore = set('q Q cm Do'.split()) + Image = PdfName.Image + + def check(obj): + if obj.Subtype == Image: + return False + s = obj.stream + if len(s) < maxignore: + s = (x for x in s.split() if not x.startswith('/') and + x not in ignore) + s = (x.replace('.', '').replace('-', '') for x in s) + if not [x for x in s if not x.isdigit()]: + return True + return check + + +def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72, + image_only=False, ignore=trivial_xobjs(), + wrap_object=wrap_object): + ''' page_per_xobj wraps every XObj found + in its own page object. + width and margin are used to set image sizes. 
+ ''' + try: + iter(margin) + except: + margin = [margin] + while len(margin) < 4: + margin *= 2 + + if isinstance(xobj_iter, (list, dict)): + xobj_iter = find_objects(xobj_iter) + for obj in xobj_iter: + if not ignore(obj): + if not image_only or obj.Subtype == PdfName.IMage: + yield wrap_object(obj, width, margin) diff --git a/pdfrw/objects/__init__.py b/pdfrw/objects/__init__.py new file mode 100644 index 0000000..1746dfe --- /dev/null +++ b/pdfrw/objects/__init__.py @@ -0,0 +1,19 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +Objects that can occur in PDF files. The most important +objects are arrays and dicts. Either of these can be +indirect or not, and dicts could have an associated +stream. +''' +from .pdfname import PdfName +from .pdfdict import PdfDict, IndirectPdfDict +from .pdfarray import PdfArray +from .pdfobject import PdfObject +from .pdfstring import PdfString +from .pdfindirect import PdfIndirect + +__all__ = [PdfName, PdfDict, IndirectPdfDict, PdfArray, + PdfObject, PdfString, PdfIndirect] diff --git a/pdfrw/objects/pdfarray.py b/pdfrw/objects/pdfarray.py new file mode 100644 index 0000000..b662755 --- /dev/null +++ b/pdfrw/objects/pdfarray.py @@ -0,0 +1,67 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +from .pdfindirect import PdfIndirect +from .pdfobject import PdfObject + + +def _resolved(): + pass + + +class PdfArray(list): + ''' A PdfArray maps the PDF file array object into a Python list. + It has an indirect attribute which defaults to False. 
+ ''' + indirect = False + + def __init__(self, source=[]): + self._resolve = self._resolver + self.extend(source) + + def _resolver(self, isinstance=isinstance, enumerate=enumerate, + listiter=list.__iter__, PdfIndirect=PdfIndirect, + resolved=_resolved, PdfNull=PdfObject('null')): + for index, value in enumerate(list.__iter__(self)): + if isinstance(value, PdfIndirect): + value = value.real_value() + if value is None: + value = PdfNull + self[index] = value + self._resolve = resolved + + def __getitem__(self, index, listget=list.__getitem__): + self._resolve() + return listget(self, index) + + try: + def __getslice__(self, i, j, listget=list.__getslice__): + self._resolve() + return listget(self, i, j) + except AttributeError: + pass + + def __iter__(self, listiter=list.__iter__): + self._resolve() + return listiter(self) + + def count(self, item): + self._resolve() + return list.count(self, item) + + def index(self, item): + self._resolve() + return list.index(self, item) + + def remove(self, item): + self._resolve() + return list.remove(self, item) + + def sort(self, *args, **kw): + self._resolve() + return list.sort(self, *args, **kw) + + def pop(self, *args): + self._resolve() + return list.pop(self, *args) diff --git a/pdfrw/objects/pdfdict.py b/pdfrw/objects/pdfdict.py new file mode 100644 index 0000000..fc28492 --- /dev/null +++ b/pdfrw/objects/pdfdict.py @@ -0,0 +1,233 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +from .pdfname import PdfName, BasePdfName +from .pdfindirect import PdfIndirect +from .pdfobject import PdfObject +from ..py23_diffs import iteritems +from ..errors import PdfParseError + + +class _DictSearch(object): + ''' Used to search for inheritable attributes. 
+ ''' + + def __init__(self, basedict): + self.basedict = basedict + + def __getattr__(self, name, PdfName=PdfName): + return self[PdfName(name)] + + def __getitem__(self, name, set=set, getattr=getattr, id=id): + visited = set() + mydict = self.basedict + while 1: + value = mydict[name] + if value is not None: + return value + myid = id(mydict) + assert myid not in visited + visited.add(myid) + mydict = mydict.Parent + if mydict is None: + return + + +class _Private(object): + ''' Used to store private attributes (not output to PDF files) + on PdfDict classes + ''' + + def __init__(self, pdfdict): + vars(self)['pdfdict'] = pdfdict + + def __setattr__(self, name, value): + vars(self.pdfdict)[name] = value + + +class PdfDict(dict): + ''' PdfDict objects are subclassed dictionaries + with the following features: + + - Every key in the dictionary starts with "/" + + - A dictionary item can be deleted by assigning it to None + + - Keys that (after the initial "/") conform to Python + naming conventions can also be accessed (set and retrieved) + as attributes of the dictionary. E.g. mydict.Page is the + same thing as mydict['/Page'] + + - Private attributes (not in the PDF space) can be set + on the dictionary object attribute dictionary by using + the private attribute: + + mydict.private.foo = 3 + mydict.foo = 5 + x = mydict.foo # x will now contain 3 + y = mydict['/foo'] # y will now contain 5 + + Most standard adobe dictionary keys start with an upper case letter, + so to avoid conflicts, it is best to start private attributes with + lower case letters. 
+ + - PdfDicts have the following read-only properties: + + - private -- as discussed above, provides write access to + dictionary's attributes + - inheritable -- this creates and returns a "view" attribute + that will search through the object hierarchy for + any desired attribute, such as /Rotate or /MediaBox + + - PdfDicts also have the following special attributes: + - indirect is not stored in the PDF dictionary, but in the object's + attribute dictionary + - stream is also stored in the object's attribute dictionary + and will also update the stream length. + - _stream will store in the object's attribute dictionary without + updating the stream length. + + It is possible, for example, to have a PDF name such as "/indirect" + or "/stream", but you cannot access such a name as an attribute: + + mydict.indirect -- accesses object's attribute dictionary + mydict["/indirect"] -- accesses actual PDF dictionary + ''' + indirect = False + stream = None + + _special = dict(indirect=('indirect', False), + stream=('stream', True), + _stream=('stream', False), + ) + + def __setitem__(self, name, value, setter=dict.__setitem__, + BasePdfName=BasePdfName, isinstance=isinstance): + if not isinstance(name, BasePdfName): + raise PdfParseError('Dict key %s is not a PdfName' % repr(name)) + if value is not None: + setter(self, name, value) + elif name in self: + del self[name] + + def __init__(self, *args, **kw): + if args: + if len(args) == 1: + args = args[0] + self.update(args) + if isinstance(args, PdfDict): + self.indirect = args.indirect + self._stream = args.stream + for key, value in iteritems(kw): + setattr(self, key, value) + + def __getattr__(self, name, PdfName=PdfName): + ''' If the attribute doesn't exist on the dictionary object, + try to slap a '/' in front of it and get it out + of the actual dictionary itself. 
+ ''' + return self.get(PdfName(name)) + + def get(self, key, dictget=dict.get, isinstance=isinstance, + PdfIndirect=PdfIndirect): + ''' Get a value out of the dictionary, + after resolving any indirect objects. + ''' + value = dictget(self, key) + if isinstance(value, PdfIndirect): + self[key] = value = value.real_value() + return value + + def __getitem__(self, key): + return self.get(key) + + def __setattr__(self, name, value, special=_special.get, + PdfName=PdfName, vars=vars): + ''' Set an attribute on the dictionary. Handle the keywords + indirect, stream, and _stream specially (for content objects) + ''' + info = special(name) + if info is None: + self[PdfName(name)] = value + else: + name, setlen = info + vars(self)[name] = value + if setlen: + notnone = value is not None + self.Length = notnone and PdfObject(len(value)) or None + + def iteritems(self, dictiter=iteritems, + isinstance=isinstance, PdfIndirect=PdfIndirect, + BasePdfName=BasePdfName): + ''' Iterate over the dictionary, resolving any unresolved objects + ''' + for key, value in list(dictiter(self)): + if isinstance(value, PdfIndirect): + self[key] = value = value.real_value() + if value is not None: + if not isinstance(key, BasePdfName): + raise PdfParseError('Dict key %s is not a PdfName' % + repr(key)) + yield key, value + + def items(self): + return list(self.iteritems()) + + def itervalues(self): + for key, value in self.iteritems(): + yield value + + def values(self): + return list((value for key, value in self.iteritems())) + + def keys(self): + return list((key for key, value in self.iteritems())) + + def __iter__(self): + for key, value in self.iteritems(): + yield key + + def iterkeys(self): + return iter(self) + + def copy(self): + return type(self)(self) + + def pop(self, key): + value = self.get(key) + del self[key] + return value + + def popitem(self): + key, value = dict.pop(self) + if isinstance(value, PdfIndirect): + value = value.real_value() + return value + + def 
inheritable(self): + ''' Search through ancestors as needed for inheritable + dictionary items. + NOTE: You might think it would be a good idea + to cache this class, but then you'd have to worry + about it pointing to the wrong dictionary if you + made a copy of the object... + ''' + return _DictSearch(self) + inheritable = property(inheritable) + + def private(self): + ''' Allows setting private metadata for use in + processing (not sent to PDF file). + See note on inheritable + ''' + return _Private(self) + private = property(private) + + +class IndirectPdfDict(PdfDict): + ''' IndirectPdfDict is a convenience class. You could + create a direct PdfDict and then set indirect = True on it, + or you could just create an IndirectPdfDict. + ''' + indirect = True diff --git a/pdfrw/objects/pdfindirect.py b/pdfrw/objects/pdfindirect.py new file mode 100644 index 0000000..4df8ac3 --- /dev/null +++ b/pdfrw/objects/pdfindirect.py @@ -0,0 +1,22 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + + +class _NotLoaded(object): + pass + + +class PdfIndirect(tuple): + ''' A placeholder for an object that hasn't been read in yet. + The object itself is the (object number, generation number) tuple. + The attributes include information about where the object is + referenced from and the file object to retrieve the real object from. 
+ ''' + value = _NotLoaded + + def real_value(self, NotLoaded=_NotLoaded): + value = self.value + if value is NotLoaded: + value = self.value = self._loader(self) + return value diff --git a/pdfrw/objects/pdfname.py b/pdfrw/objects/pdfname.py new file mode 100644 index 0000000..1fdf5b5 --- /dev/null +++ b/pdfrw/objects/pdfname.py @@ -0,0 +1,80 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +import re + +from ..errors import log + +warn = log.warning + + +class BasePdfName(str): + ''' A PdfName is an identifier that starts with + a slash. + + If a PdfName has illegal space or delimiter characters, + then it will be decorated with an "encoded" attribute that + has those characters properly escaped as # + + The "encoded" attribute is what is sent out to a PDF file, + the non-encoded main object is what is compared for equality + in a PDF dictionary. + ''' + + indirect = False + + whitespace = '\x00 \t\f\r\n' + delimiters = '()<>{}[]/%' + forbidden = list(whitespace) + list('\\' + x for x in delimiters) + remap = dict((x, '#%02X' % ord(x)) for x in (whitespace + delimiters)) + split_to_encode = re.compile('(%s)' % '|'.join(forbidden)).split + split_to_decode = re.compile(r'\#([0-9A-Fa-f]{2})').split + + def __new__(cls, name, pre_encoded=True, remap=remap, + join=''.join, new=str.__new__, chr=chr, int=int, + split_to_encode=split_to_encode, + split_to_decode=split_to_decode, + ): + ''' We can build a PdfName from scratch, or from + a pre-encoded name (e.g. coming in from a file). 
+ ''' + # Optimization for normal case + if name[1:].isalnum(): + return new(cls, name) + encoded = name + if pre_encoded: + if '#' in name: + substrs = split_to_decode(name) + substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2]) + name = join(substrs) + else: + encoded = split_to_encode(encoded) + encoded[3::2] = (remap[x] for x in encoded[3::2]) + encoded = join(encoded) + self = new(cls, name) + if encoded != name: + self.encoded = encoded + return self + + +# We could have used a metaclass, but this matches what +# we were doing historically. + +class PdfName(object): + ''' Two simple ways to get a PDF name from a string: + + x = PdfName.FooBar + x = pdfName('FooBar') + + Either technique will return "/FooBar" + + ''' + + def __getattr__(self, name, BasePdfName=BasePdfName): + return BasePdfName('/' + name, False) + + def __call__(self, name, BasePdfName=BasePdfName): + return BasePdfName('/' + name, False) + +PdfName = PdfName() diff --git a/pdfrw/objects/pdfobject.py b/pdfrw/objects/pdfobject.py new file mode 100644 index 0000000..7317395 --- /dev/null +++ b/pdfrw/objects/pdfobject.py @@ -0,0 +1,11 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + + +class PdfObject(str): + ''' A PdfObject is a textual representation of any PDF file object + other than an array, dict or string. It has an indirect attribute + which defaults to False. + ''' + indirect = False diff --git a/pdfrw/objects/pdfstring.py b/pdfrw/objects/pdfstring.py new file mode 100644 index 0000000..5c35d70 --- /dev/null +++ b/pdfrw/objects/pdfstring.py @@ -0,0 +1,74 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +import re + + +class PdfString(str): + ''' A PdfString is an encoded string. 
It has a decode + method to get the actual string data out, and there + is an encode class method to create such a string. + Like any PDF object, it could be indirect, but it + defaults to being a direct object. + ''' + indirect = False + unescape_dict = {'\\b': '\b', '\\f': '\f', '\\n': '\n', + '\\r': '\r', '\\t': '\t', + '\\\r\n': '', '\\\r': '', '\\\n': '', + '\\\\': '\\', '\\': '', + } + unescape_pattern = (r'(\\\\|\\b|\\f|\\n|\\r|\\t' + r'|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)') + unescape_func = re.compile(unescape_pattern).split + + hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' + hex_func = re.compile(hex_pattern).split + + hex_pattern2 = ('([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|' + '[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])') + hex_func2 = re.compile(hex_pattern2).split + + hex_funcs = hex_func, hex_func2 + + def decode_regular(self, remap=chr): + assert self[0] == '(' and self[-1] == ')' + mylist = self.unescape_func(self[1:-1]) + result = [] + unescape = self.unescape_dict.get + for chunk in mylist: + chunk = unescape(chunk, chunk) + if chunk.startswith('\\') and len(chunk) > 1: + value = int(chunk[1:], 8) + # FIXME: TODO: Handle unicode here + if value > 127: + value = 127 + chunk = remap(value) + if chunk: + result.append(chunk) + return ''.join(result) + + def decode_hex(self, remap=chr, twobytes=False): + data = ''.join(self.split()) + data = self.hex_funcs[twobytes](data) + chars = data[1::2] + other = data[0::2] + assert (other[0] == '<' and + other[-1] == '>' and + ''.join(other) == '<>'), self + return ''.join([remap(int(x, 16)) for x in chars]) + + def decode(self, remap=chr, twobytes=False): + if self.startswith('('): + return self.decode_regular(remap) + + else: + return self.decode_hex(remap, twobytes) + + def encode(cls, source, usehex=False): + assert not usehex, "Not supported yet" + source = source.replace('\\', '\\\\') + source = source.replace('(', '\\(') + source = source.replace(')', '\\)') + return cls('(' + source + ')') + encode = 
classmethod(encode) diff --git a/pdfrw/pagemerge.py b/pdfrw/pagemerge.py new file mode 100644 index 0000000..de1d69b --- /dev/null +++ b/pdfrw/pagemerge.py @@ -0,0 +1,242 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +This module contains code to edit pages. Sort of a canvas, I +suppose, but I wouldn't want to call it that and get people all +excited or anything. + +No, this is just for doing basic things like merging/splitting +apart pages, watermarking, etc. All it does is allow converting +pages (or parts of pages) into Form XObject rectangles, and then +plopping those down on new or pre-existing pages. +''' + +from .objects import PdfDict, PdfArray, PdfName +from .buildxobj import pagexobj, ViewInfo + +NullInfo = ViewInfo() + + +class RectXObj(PdfDict): + ''' This class facilitates doing positioning (moving and scaling) + of Form XObjects within their containing page, by modifying + the Form XObject's transformation matrix. + + By default, this class keeps the aspect ratio locked. For + example, if your object is foo, you can write 'foo.w = 200', + and it will scale in both the x and y directions. + + To unlock the aspect ration, you have to do a tiny bit of math + and call the scale function. + ''' + def __init__(self, page, viewinfo=NullInfo, **kw): + ''' The page is a page returned by PdfReader. It will be + turned into a cached Form XObject (so that multiple + rectangles can be extracted from it if desired), and then + another Form XObject will be built using it and the viewinfo + (which should be a ViewInfo class). The viewinfo includes + source coordinates (from the top/left) and rotation information. + + Once the object has been built, its destination coordinates + may be examined and manipulated by using x, y, w, h, and + scale. The destination coordinates are in the normal + PDF programmatic system (starting at bottom left). 
+ ''' + if kw: + if viewinfo is not NullInfo: + raise ValueError("Cannot modify preexisting ViewInfo") + viewinfo = ViewInfo(**kw) + viewinfo.cacheable = False + base = pagexobj(page, viewinfo) + self.update(base) + self.indirect = True + self.stream = base.stream + private = self.private + private._rect = [base.x, base.y, base.w, base.h] + matrix = self.Matrix + if matrix is None: + matrix = self.Matrix = PdfArray((1, 0, 0, 1, 0, 0)) + private._matrix = matrix # Lookup optimization + # Default to lower-left corner + self.x = 0 + self.y = 0 + + @property + def x(self): + ''' X location (from left) of object in points + ''' + return self._rect[0] + + @property + def y(self): + ''' Y location (from bottom) of object in points + ''' + return self._rect[1] + + @property + def w(self): + ''' Width of object in points + ''' + return self._rect[2] + + @property + def h(self): + ''' Height of object in points + ''' + return self._rect[3] + + def __setattr__(self, name, value, next=PdfDict.__setattr__, + mine=set('x y w h'.split())): + ''' The underlying __setitem__ won't let us use a property + setter, so we have to fake one. + ''' + if name not in mine: + return next(self, name, value) + if name in 'xy': + r_index, m_index = (0, 4) if name == 'x' else (1, 5) + self._rect[r_index], old = value, self._rect[r_index] + self._matrix[m_index] += value - old + else: + index = 2 + (value == 'h') + self.scale(value / self._rect[index]) + + def scale(self, x_scale, y_scale=None): + ''' Current scaling deals properly with things that + have been rotated in 90 degree increments + (via the ViewMerge object given when instantiating). 
+ ''' + if y_scale is None: + y_scale = x_scale + x, y, w, h = rect = self._rect + ao, bo, co, do, eo, fo = matrix = self._matrix + an = ao * x_scale + bn = bo * y_scale + cn = co * x_scale + dn = do * y_scale + en = x + (eo - x) * 1.0 * (an + cn) / (ao + co) + fn = y + (fo - y) * 1.0 * (bn + dn) / (bo + do) + matrix[:] = an, bn, cn, dn, en, fn + rect[:] = x, y, w * x_scale, h * y_scale + + @property + def box(self): + ''' Return the bounding box for the object + ''' + x, y, w, h = self._rect + return PdfArray([x, y, x + w, y + h]) + + +class PageMerge(list): + ''' A PageMerge object can have 0 or 1 underlying pages + (that get edited with the results of the merge) + and 0-n RectXObjs that can be applied before or + after the underlying page. + ''' + page = None + mbox = None + cbox = None + resources = None + rotate = None + contents = None + + def __init__(self, page=None): + if page is not None: + self.setpage(page) + + def setpage(self, page): + if page.Type != PdfName.Page: + raise TypeError("Expected page") + self.append(None) # Placeholder + self.page = page + inheritable = page.inheritable + self.mbox = inheritable.MediaBox + self.cbox = inheritable.CropBox + self.resources = inheritable.Resources + self.rotate = inheritable.Rotate + self.contents = page.Contents + + def __add__(self, other): + if isinstance(other, dict): + other = [other] + for other in other: + self.add(other) + return self + + def add(self, obj, prepend=False, **kw): + if kw: + obj = RectXObj(obj, **kw) + elif obj.Type == PdfName.Page: + obj = RectXObj(obj) + if prepend: + self.insert(0, obj) + else: + self.append(obj) + return self + + def render(self): + def do_xobjs(xobj_list): + content = [] + for obj in xobj_list: + index = PdfName('pdfrw_%d' % (key_offset + len(xobjs))) + if xobjs.setdefault(index, obj) is not obj: + raise KeyError("XObj key %s already in use" % index) + content.append('%s Do' % index) + return PdfDict(indirect=True, stream='\n'.join(content)) + + mbox = self.mbox 
+ cbox = self.cbox + page = self.page + old_contents = self.contents + resources = self.resources or PdfDict() + + key_offset = 0 + xobjs = resources.XObject + if xobjs is None: + xobjs = resources.XObject = PdfDict() + else: + allkeys = xobjs.keys() + if allkeys: + keys = (x for x in allkeys if x.startswith('/pdfrw_')) + keys = (x for x in keys if x[6:].isdigit()) + keys = sorted(keys, key=lambda x: int(x[6:])) + key_offset = (int(keys[-1][6:]) + 1) if keys else 0 + key_offset -= len(allkeys) + + if old_contents is None: + new_contents = do_xobjs(self) + else: + isdict = isinstance(old_contents, PdfDict) + old_contents = [old_contents] if isdict else old_contents + new_contents = PdfArray() + index = self.index(None) + if index: + new_contents.append(do_xobjs(self[:index])) + new_contents.extend(old_contents) + index += 1 + if index < len(self): + new_contents.append(do_xobjs(self[index:])) + + if mbox is None: + cbox = None + mbox = self.xobj_box + mbox[0] = min(0, mbox[0]) + mbox[1] = min(0, mbox[1]) + + page = PdfDict(indirect=True) if page is None else page + page.Type = PdfName.Page + page.Resources = resources + page.MediaBox = mbox + page.CropBox = cbox + page.Rotate = self.rotate + page.Contents = new_contents + return page + + @property + def xobj_box(self): + ''' Return the smallest box that encloses every object + in the list. + ''' + a, b, c, d = zip(*(xobj.box for xobj in self)) + return PdfArray((min(a), min(b), max(c), max(d))) diff --git a/pdfrw/pdfreader.py b/pdfrw/pdfreader.py new file mode 100644 index 0000000..0baf0eb --- /dev/null +++ b/pdfrw/pdfreader.py @@ -0,0 +1,591 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# Copyright (C) 2012-2015 Nerijus Mika +# MIT license -- See LICENSE.txt for details + +''' +The PdfReader class reads an entire PDF file into memory and +parses the top-level container objects. (It does not parse +into streams.) 
The object subclasses PdfDict, and the +document pages are stored in a list in the pages attribute +of the object. +''' +import gc +import binascii +import collections +import itertools + +from .errors import PdfParseError, log +from .tokens import PdfTokens +from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect +from .uncompress import uncompress +from .py23_diffs import convert_load, iteritems + + +class PdfReader(PdfDict): + + def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int): + ''' Return a previously loaded indirect object, or create + a placeholder for it. + ''' + key = int(objnum), int(gennum) + result = self.indirect_objects.get(key) + if result is None: + self.indirect_objects[key] = result = PdfIndirect(key) + self.deferred_objects.add(key) + result._loader = self.loadindirect + return result + + def readarray(self, source, PdfArray=PdfArray): + ''' Found a [ token. Parse the tokens after that. + ''' + specialget = self.special.get + result = [] + pop = result.pop + append = result.append + + for value in source: + if value in ']R': + if value == ']': + break + generation = pop() + value = self.findindirect(pop(), generation) + else: + func = specialget(value) + if func is not None: + value = func(source) + append(value) + return PdfArray(result) + + def readdict(self, source, PdfDict=PdfDict): + ''' Found a << token. Parse the tokens after that. 
+ ''' + specialget = self.special.get + result = PdfDict() + next = source.next + + tok = next() + while tok != '>>': + if not tok.startswith('/'): + source.error('Expected PDF /name object') + tok = next() + continue + key = tok + value = next() + func = specialget(value) + if func is not None: + value = func(source) + tok = next() + else: + tok = next() + if value.isdigit() and tok.isdigit(): + tok2 = next() + if tok2 != 'R': + source.error('Expected "R" following two integers') + tok = tok2 + continue + value = self.findindirect(value, tok) + tok = next() + result[key] = value + return result + + def empty_obj(self, source, PdfObject=PdfObject): + ''' Some silly git put an empty object in the + file. Back up so the caller sees the endobj. + ''' + source.floc = source.tokstart + + def badtoken(self, source): + ''' Didn't see that coming. + ''' + source.exception('Unexpected delimiter') + + def findstream(self, obj, tok, source, len=len): + ''' Figure out if there is a content stream + following an object, and return the start + pointer to the content stream if so. + + (We can't read it yet, because we might not + know how long it is, because Length might + be an indirect object.) 
+ ''' + + fdata = source.fdata + startstream = source.tokstart + len(tok) + gotcr = fdata[startstream] == '\r' + startstream += gotcr + gotlf = fdata[startstream] == '\n' + startstream += gotlf + if not gotlf: + if not gotcr: + source.error(r'stream keyword not followed by \n') + else: + source.warning(r"stream keyword terminated " + r"by \r without \n") + return startstream + + def readstream(self, obj, startstream, source, exact_required=False, + streamending='endstream endobj'.split(), int=int): + fdata = source.fdata + length = int(obj.Length) + source.floc = target_endstream = startstream + length + endit = source.multiple(2) + obj._stream = fdata[startstream:target_endstream] + if endit == streamending: + return + + if exact_required: + source.exception('Expected endstream endobj') + + # The length attribute does not match the distance between the + # stream and endstream keywords. + + # TODO: Extract maxstream from dictionary of object offsets + # and use rfind instead of find. + maxstream = len(fdata) - 20 + endstream = fdata.find('endstream', startstream, maxstream) + source.floc = startstream + room = endstream - startstream + if endstream < 0: + source.error('Could not find endstream') + return + if (length == room + 1 and + fdata[startstream - 2:startstream] == '\r\n'): + source.warning(r"stream keyword terminated by \r without \n") + obj._stream = fdata[startstream - 1:target_endstream - 1] + return + source.floc = endstream + if length > room: + source.error('stream /Length attribute (%d) appears to ' + 'be too big (size %d) -- adjusting', + length, room) + obj.stream = fdata[startstream:endstream] + return + if fdata[target_endstream:endstream].rstrip(): + source.error('stream /Length attribute (%d) appears to ' + 'be too small (size %d) -- adjusting', + length, room) + obj.stream = fdata[startstream:endstream] + return + endobj = fdata.find('endobj', endstream, maxstream) + if endobj < 0: + source.error('Could not find endobj after endstream') + 
return + if fdata[endstream:endobj].rstrip() != 'endstream': + source.error('Unexpected data between endstream and endobj') + return + source.error('Illegal endstream/endobj combination') + + def loadindirect(self, key, PdfDict=PdfDict, + isinstance=isinstance): + result = self.indirect_objects.get(key) + if not isinstance(result, PdfIndirect): + return result + source = self.source + offset = int(self.source.obj_offsets.get(key, '0')) + if not offset: + source.warning("Did not find PDF object %s", key) + return None + + # Read the object header and validate it + objnum, gennum = key + source.floc = offset + objid = source.multiple(3) + ok = len(objid) == 3 + ok = ok and objid[0].isdigit() and int(objid[0]) == objnum + ok = ok and objid[1].isdigit() and int(objid[1]) == gennum + ok = ok and objid[2] == 'obj' + if not ok: + source.floc = offset + source.next() + objheader = '%d %d obj' % (objnum, gennum) + fdata = source.fdata + offset2 = (fdata.find('\n' + objheader) + 1 or + fdata.find('\r' + objheader) + 1) + if (not offset2 or + fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0): + source.warning("Expected indirect object '%s'", objheader) + return None + source.warning("Indirect object %s found at incorrect " + "offset %d (expected offset %d)", + objheader, offset2, offset) + source.floc = offset2 + len(objheader) + + # Read the object, and call special code if it starts + # an array or dictionary + obj = source.next() + func = self.special.get(obj) + if func is not None: + obj = func(source) + + self.indirect_objects[key] = obj + self.deferred_objects.remove(key) + + # Mark the object as indirect, and + # just return it if it is a simple object. + obj.indirect = key + tok = source.next() + if tok == 'endobj': + return obj + + # Should be a stream. Either that or it's broken. 
+ isdict = isinstance(obj, PdfDict) + if isdict and tok == 'stream': + self.readstream(obj, self.findstream(obj, tok, source), source) + return obj + + # Houston, we have a problem, but let's see if it + # is easily fixable. Leaving out a space before endobj + # is apparently an easy mistake to make on generation + # (Because it won't be noticed unless you are specifically + # generating an indirect object that doesn't end with any + # sort of delimiter.) It is so common that things like + # okular just handle it. + + if isinstance(obj, PdfObject) and obj.endswith('endobj'): + source.error('No space or delimiter before endobj') + obj = PdfObject(obj[:-6]) + else: + source.error("Expected 'endobj'%s token", + isdict and " or 'stream'" or '') + obj = PdfObject('') + + obj.indirect = key + self.indirect_objects[key] = obj + return obj + + def read_all(self): + deferred = self.deferred_objects + prev = set() + while 1: + new = deferred - prev + if not new: + break + prev |= deferred + for key in new: + self.loadindirect(key) + + def uncompress(self): + self.read_all() + uncompress(self.indirect_objects.values()) + + def load_stream_objects(self, object_streams): + # read object streams + objs = [] + for num in object_streams: + obj = self.findindirect(num, 0).real_value() + assert obj.Type == '/ObjStm' + objs.append(obj) + + # read objects from stream + if objs: + uncompress(objs) + for obj in objs: + objsource = PdfTokens(obj.stream, 0, False) + snext = objsource.next + offsets = {} + firstoffset = int(obj.First) + num = snext() + while num.isdigit(): + offset = int(snext()) + offsets[int(num)] = firstoffset + offset + num = snext() + for num, offset in iteritems(offsets): + # Read the object, and call special code if it starts + # an array or dictionary + objsource.floc = offset + sobj = snext() + func = self.special.get(sobj) + if func is not None: + sobj = func(objsource) + + key = (num, 0) + self.indirect_objects[key] = sobj + if key in self.deferred_objects: + 
self.deferred_objects.remove(key) + + # Mark the object as indirect, and + # add it to the list of streams if it starts a stream + sobj.indirect = key + + def findxref(self, fdata): + ''' Find the cross reference section at the end of a file + ''' + startloc = fdata.rfind('startxref') + if startloc < 0: + raise PdfParseError('Did not find "startxref" at end of file') + source = PdfTokens(fdata, startloc, False, self.verbose) + tok = source.next() + assert tok == 'startxref' # (We just checked this...) + tableloc = source.next_default() + if not tableloc.isdigit(): + source.exception('Expected table location') + if source.next_default().rstrip().lstrip('%') != 'EOF': + source.exception('Expected %%EOF') + return startloc, PdfTokens(fdata, int(tableloc), True, self.verbose) + + def parse_xref_stream(self, source, int=int, range=range, + enumerate=enumerate, islice=itertools.islice, + defaultdict=collections.defaultdict, + hexlify=binascii.hexlify): + ''' Parse (one of) the cross-reference file section(s) + ''' + + def readint(s, lengths): + lengths = itertools.cycle(lengths) + offset = 0 + for length in itertools.cycle(lengths): + next = offset + length + yield int(hexlify(s[offset:next]), 16) if length else None + offset = next + + setdefault = source.obj_offsets.setdefault + next = source.next + # check for xref stream object + objid = source.multiple(3) + ok = len(objid) == 3 + ok = ok and objid[0].isdigit() + ok = ok and objid[1] == 'obj' + ok = ok and objid[2] == '<<' + if not ok: + source.exception('Expected xref stream start') + obj = self.readdict(source) + if obj.Type != PdfName.XRef: + source.exception('Expected dict type of /XRef') + tok = next() + self.readstream(obj, self.findstream(obj, tok, source), source, True) + if not uncompress([obj], True): + source.exception('Could not decompress Xref stream') + num_pairs = obj.Index or PdfArray(['0', obj.Size]) + num_pairs = [int(x) for x in num_pairs] + num_pairs = zip(num_pairs[0::2], num_pairs[1::2]) + 
entry_sizes = [int(x) for x in obj.W] + if len(entry_sizes) != 3: + source.exception('Invalid entry size') + object_streams = defaultdict(list) + get = readint(obj.stream, entry_sizes) + for objnum, size in num_pairs: + for cnt in range(size): + xtype, p1, p2 = islice(get, 3) + if xtype in (1, None): + if p1: + setdefault((objnum, p2 or 0), p1) + elif xtype == 2: + object_streams[p1].append((objnum, p2)) + objnum += 1 + + obj.private.object_streams = object_streams + return obj + + def parse_xref_table(self, source, int=int, range=range): + ''' Parse (one of) the cross-reference file section(s) + ''' + setdefault = source.obj_offsets.setdefault + next = source.next + # plain xref table + start = source.floc + try: + while 1: + tok = next() + if tok == 'trailer': + return + startobj = int(tok) + for objnum in range(startobj, startobj + int(next())): + offset = int(next()) + generation = int(next()) + inuse = next() + if inuse == 'n': + if offset != 0: + setdefault((objnum, generation), offset) + elif inuse != 'f': + raise ValueError + except: + pass + try: + # Table formatted incorrectly. + # See if we can figure it out anyway. 
+ end = source.fdata.rindex('trailer', start) + table = source.fdata[start:end].splitlines() + for line in table: + tokens = line.split() + if len(tokens) == 2: + objnum = int(tokens[0]) + elif len(tokens) == 3: + offset, generation, inuse = (int(tokens[0]), + int(tokens[1]), tokens[2]) + if offset != 0 and inuse == 'n': + setdefault((objnum, generation), offset) + objnum += 1 + elif tokens: + log.error('Invalid line in xref table: %s' % + repr(line)) + raise ValueError + log.warning('Badly formatted xref table') + source.floc = end + next() + except: + source.floc = start + source.exception('Invalid table format') + + def parsexref(self, source): + ''' Parse (one of) the cross-reference file section(s) + ''' + next = source.next + tok = next() + if tok.isdigit(): + return self.parse_xref_stream(source), True + elif tok == 'xref': + self.parse_xref_table(source) + tok = next() + if tok != '<<': + source.exception('Expected "<<" starting catalog') + return self.readdict(source), False + else: + source.exception('Expected "xref" keyword or xref stream object') + + def readpages(self, node): + pagename = PdfName.Page + pagesname = PdfName.Pages + catalogname = PdfName.Catalog + typename = PdfName.Type + kidname = PdfName.Kids + + # PDFs can have arbitrarily nested Pages/Page + # dictionary structures. + def readnode(node): + nodetype = node[typename] + if nodetype == pagename: + yield node + elif nodetype == pagesname: + for node in node[kidname]: + for node in readnode(node): + yield node + elif nodetype == catalogname: + for node in readnode(node[pagesname]): + yield node + else: + log.error('Expected /Page or /Pages dictionary, got %s' % + repr(node)) + try: + return list(readnode(node)) + except (AttributeError, TypeError) as s: + log.error('Invalid page tree: %s' % s) + return [] + + def __init__(self, fname=None, fdata=None, decompress=False, + disable_gc=True, verbose=True): + + self.private.verbose = verbose + # Runs a lot faster with GC off. 
+ disable_gc = disable_gc and gc.isenabled() + if disable_gc: + gc.disable() + try: + if fname is not None: + assert fdata is None + # Allow reading preexisting streams like pyPdf + if hasattr(fname, 'read'): + fdata = fname.read() + else: + try: + f = open(fname, 'rb') + fdata = f.read() + f.close() + except IOError: + raise PdfParseError('Could not read PDF file %s' % + fname) + fdata = convert_load(fdata) + assert fdata is not None + if not fdata.startswith('%PDF-'): + startloc = fdata.find('%PDF-') + if startloc >= 0: + log.warning('PDF header not at beginning of file') + else: + lines = fdata.lstrip().splitlines() + if not lines: + raise PdfParseError('Empty PDF file!') + raise PdfParseError('Invalid PDF header: %s' % + repr(lines[0])) + + self.private.version = fdata[5:8] + + endloc = fdata.rfind('%EOF') + if endloc < 0: + raise PdfParseError('EOF mark not found: %s' % + repr(fdata[-20:])) + endloc += 6 + junk = fdata[endloc:] + fdata = fdata[:endloc] + if junk.rstrip('\00').strip(): + log.warning('Extra data at end of file') + + private = self.private + private.indirect_objects = {} + private.deferred_objects = set() + private.special = {'<<': self.readdict, + '[': self.readarray, + 'endobj': self.empty_obj, + } + for tok in r'\ ( ) < > { } ] >> %'.split(): + self.special[tok] = self.badtoken + + startloc, source = self.findxref(fdata) + private.source = source + + # Find all the xref tables/streams, and + # then deal with them backwards. 
+ xref_list = [] + while 1: + source.obj_offsets = {} + trailer, is_stream = self.parsexref(source) + prev = trailer.Prev + if prev is None: + token = source.next() + if token != 'startxref' and not xref_list: + source.warning('Expected "startxref" ' + 'at end of xref table') + break + xref_list.append((source.obj_offsets, trailer, is_stream)) + source.floc = int(prev) + + if is_stream: + self.load_stream_objects(trailer.object_streams) + + while xref_list: + later_offsets, later_trailer, is_stream = xref_list.pop() + source.obj_offsets.update(later_offsets) + if is_stream: + trailer.update(later_trailer) + self.load_stream_objects(later_trailer.object_streams) + else: + trailer = later_trailer + + trailer.Prev = None + + if (trailer.Version and + float(trailer.Version) > float(self.version)): + self.private.version = trailer.Version + + if is_stream: + self.Root = trailer.Root + self.Info = trailer.Info + self.ID = trailer.ID + self.Size = trailer.Size + self.Encrypt = trailer.Encrypt + else: + self.update(trailer) + + # self.read_all_indirect(source) + private.pages = self.readpages(self.Root) + if decompress: + self.uncompress() + + # For compatibility with pyPdf + private.numPages = len(self.pages) + finally: + if disable_gc: + gc.enable() + + # For compatibility with pyPdf + def getPage(self, pagenum): + return self.pages[pagenum] diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py new file mode 100755 index 0000000..644bb30 --- /dev/null +++ b/pdfrw/pdfwriter.py @@ -0,0 +1,341 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +The PdfWriter class writes an entire PDF file out to disk. + +The writing process is not at all optimized or organized. + +An instance of the PdfWriter class has two methods: + addpage(page) +and + write(fname) + +addpage() assumes that the pages are part of a valid +tree/forest of PDF objects. 
def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=str, encode=PdfString.encode):
    ''' Turn a plain Python value into its textual PDF form.

        This is the default formatting hook; callers may supply
        their own replacement for specialized requirements.

        Strings go through encode(), floats are written in fixed
        point notation with redundant trailing zeros (and a dangling
        decimal point) removed, and everything else is passed to str().
    '''
    if isinstance(obj, basestring):
        text = encode(obj)
    elif isinstance(obj, float):
        # PDFs don't handle exponent notation, so never rely on
        # the default float representation.
        fixed = '%.9f' % obj
        text = fixed.rstrip('0').rstrip('.')
    else:
        text = str(obj)
    return text
def format_obj(obj):
    ''' format PDF object data into semi-readable ASCII.
        May mutually recurse with add() -- add() will
        return references for indirect objects, and add
        the indirect object to the list.

        Containers are formatted element by element through add().
        Anything else is either asked to represent itself (when it
        carries an 'indirect' attribute) or is handed to user_fmt().
        Returns the formatted string.
    '''
    # The loop runs at most twice: a plain list/tuple/dict is
    # converted to the matching Pdf container and then re-examined.
    while 1:
        if isinstance(obj, (list, dict, tuple)):
            if isinstance(obj, PdfArray):
                myarray = [add(x) for x in obj]
                return format_array(myarray, '[%s]')
            elif isinstance(obj, PdfDict):
                # Stream compression is requested through the
                # enclosing function's 'compress' argument.
                if compress and obj.stream:
                    do_compress([obj])
                # Sort on the key so that the emitted order does
                # not depend on dictionary ordering; the third
                # element is the key text that is actually written.
                pairs = sorted((x, y, getattr(x, 'encoded', x))
                               for (x, y) in obj.iteritems())
                myarray = []
                for key, value, encoding in pairs:
                    myarray.append(encoding)
                    myarray.append(add(value))
                result = format_array(myarray, '<<%s>>')
                stream = obj.stream
                if stream is not None:
                    result = ('%s\nstream\n%s\nendstream' %
                              (result, stream))
                return result
            # Plain Python container: dicts become PdfDict, lists
            # and tuples become PdfArray, then go around again.
            obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
            continue

        # We assume that an object with an indirect
        # attribute knows how to represent itself to us.
        if hasattr(obj, 'indirect'):
            return str(getattr(obj, 'encoded', obj))
        return user_fmt(obj)
def addpage(self, page):
    ''' Append one page to the output document.

        page must be a dictionary whose /Type is /Page, otherwise
        PdfOutputError is raised.  The page is stored as a new
        IndirectPdfDict built from it, with the attributes looked up
        through page.inheritable (Resources, MediaBox, CropBox and
        Rotate) set explicitly on the new dictionary.

        Returns self, so calls may be chained.
    '''
    # Whatever trailer was built before no longer matches the pages.
    self._trailer = None

    expected = PdfName.Page
    if page.Type != expected:
        raise PdfOutputError('Bad /Type: Expected %s, found %s'
                             % (expected, page.Type))

    inherited = page.inheritable  # searches for resources
    explicit = dict(
        Resources=inherited.Resources,
        MediaBox=inherited.MediaBox,
        CropBox=inherited.CropBox,
        Rotate=inherited.Rotate,
    )
    self.pagearray.append(IndirectPdfDict(page, **explicit))

    # Every ancestor of the original page is remembered (keyed by
    # id) as an object we do not want to output.  The walk stops at
    # the first ancestor that is already known.
    unwanted = self.killobj
    ancestor = page.Parent
    while ancestor is not None and id(ancestor) not in unwanted:
        unwanted[id(ancestor)] = ancestor
        ancestor = ancestor.Parent
    return self
def write(self, fname, trailer=None, user_fmt=user_fmt,
          disable_gc=True):
    ''' Format the document and write it out.

        fname      -- a file name, or any object with a write()
                      method.  A file name is opened in binary mode
                      and closed again here; a preexisting file
                      object is left open for the caller.
        trailer    -- trailer dictionary to write; self.trailer is
                      used when none (or a false value) is given.
        user_fmt   -- formatting hook passed through to FormatObjects.
        disable_gc -- switch the garbage collector off for the
                      duration of the write.  The collector is only
                      re-enabled afterwards if it was running when
                      we were called.
    '''
    trailer = trailer or self.trailer

    # Same guard as the reader uses: never turn GC back on for a
    # caller who had deliberately disabled it before calling us.
    disable_gc = disable_gc and gc.isenabled()

    # Dump the data.  We either have a filename or a preexisting
    # file object.
    preexisting = hasattr(fname, 'write')
    f = fname if preexisting else open(fname, 'wb')

    try:
        if disable_gc:
            gc.disable()
        FormatObjects(f, trailer, self.version, self.compress,
                      self.killobj, user_fmt=user_fmt)
    finally:
        try:
            if not preexisting:
                f.close()
        finally:
            # Restore GC even if closing the file failed.
            if disable_gc:
                gc.enable()
# Deal with Python2/3 differences
#
# Exports:
#   zlib          -- the zlib module, or None when it cannot be imported
#   convert_load  -- raw file data -> the str type used internally
#   convert_store -- internal str -> raw data ready to be written
#   from_array    -- array.array -> raw byte string
#   nextattr      -- name of the iterator "next" method
#   iteritems     -- unbound dict method yielding (key, value) pairs
#   xrange        -- lazy integer range

try:
    import zlib
except ImportError:
    zlib = None

# The presence of the 'unicode' builtin tells the two major versions
# apart.  Where it exists, str already is a byte string and no
# conversion is required.
try:
    unicode = unicode
except NameError:
    _str_is_bytes = False
else:
    _str_is_bytes = True

if _str_is_bytes:

    def convert_load(s):
        return s

    def convert_store(s):
        return s

    def from_array(a):
        return a.tostring()

else:

    # Latin-1 maps every byte value to the code point with the same
    # number, so the conversion round-trips arbitrary binary data.

    def convert_load(s):
        return s.decode('Latin-1')

    def convert_store(s):
        return s.encode('Latin-1')

    def from_array(a):
        return a.tobytes()

# Exactly one attribute of a list iterator has 'next' in its name;
# the unpacking fails loudly should that ever stop being true.
(nextattr,) = tuple(name for name in dir(iter([])) if 'next' in name)

iteritems = getattr(dict, 'iteritems', dict.items)

try:
    xrange = xrange
except NameError:
    xrange = range
def linepos(fdata, loc):
    ''' Convert an offset into fdata to a (line, column) pair.

        Both numbers are 1-based.  A line break is a lone '\\n',
        a lone '\\r', or a '\\r\\n' pair (counted once).  The column
        is measured from the last '\\n' or '\\r' before loc.
    '''
    linefeeds = fdata.count('\n', 0, loc)
    returns = fdata.count('\r', 0, loc)
    # Each CRLF pair was counted in both totals above.
    crlf_pairs = fdata.count('\r\n', 0, loc)
    # rfind gives -1 when there is no earlier line break, which
    # conveniently makes the first column of the first line 1.
    last_break = max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
    return linefeeds + returns - crlf_pairs + 1, loc - last_break
def _gettoks(self, startloc, cacheobj=_cacheobj,
             delimiters=delimiters, findtok=findtok,
             findparen=findparen, PdfString=PdfString,
             PdfObject=PdfObject, BasePdfName=BasePdfName):
    ''' Generator yielding the tokens of self.fdata, starting
        at offset startloc.

        Each yielded token is a PdfObject, a BasePdfName, a
        PdfString, or (for delimiters such as '<<' and '[') the
        plain matched text.  Comments are yielded only when
        self.strip_comments is false.

        The span of the most recent token is kept in
        self.current[0] as (start, end), where end is past any
        trailing whitespace.  Other code repositions the tokenizer
        by replacing that span; this is detected after every yield,
        and scanning restarts from the new end position.

        The main complication here is the literal strings, which
        can contain nested parentheses.  In order to cope with these
        we can discard the current iterator and loop back to the
        top to get a fresh one.

        We could use re.search instead of re.finditer, but that's slower.
    '''
    fdata = self.fdata
    current = self.current = [(startloc, startloc)]
    cache = {}
    while 1:
        for match in findtok(fdata, current[0][1]):
            current[0] = tokspan = match.span()
            token = match.group(1)
            firstch = token[0]
            if firstch not in delimiters:
                token = cacheobj(cache, token, PdfObject)
            elif firstch in '/<(%':
                if firstch == '/':
                    # PDF Name
                    encoded = token
                    token = cache.get(encoded)
                    if token is None:
                        token = cache[token] = BasePdfName(encoded)
                elif firstch == '<':
                    # << dict delim, or < hex string >
                    if token[1:2] != '<':
                        token = cacheobj(cache, token, PdfString)
                elif firstch == '(':
                    # Literal string
                    # It's probably simple, but maybe not
                    # Nested parentheses are a bear, and if
                    # they are present, we exit the for loop
                    # and get back in with a new starting location.
                    ends = None  # For broken strings
                    if fdata[match.end(1) - 1] != ')':
                        nest = 2
                        m_start, loc = tokspan
                        for match in findparen(fdata, loc):
                            loc = match.end(1)
                            ending = fdata[loc - 1] == ')'
                            # +1 for an opening paren, -1 for a
                            # closing one.
                            nest += 1 - ending * 2
                            if not nest:
                                break
                            # Remember the first closing paren in
                            # case the string never balances.
                            if ending and ends is None:
                                ends = loc, match.end(), nest
                        token = fdata[m_start:loc]
                        current[0] = m_start, match.end()
                        if nest:
                            # There is one possible recoverable error
                            # seen in the wild -- some stupid generators
                            # don't escape (.  If this happens, just
                            # terminate on first unescaped ).  The string
                            # won't be quite right, but that's a science
                            # fair project for another time.
                            (self.error, self.exception)[not ends](
                                'Unterminated literal string')
                            loc, ends, nest = ends
                            token = fdata[m_start:loc] + ')' * nest
                            current[0] = m_start, ends
                    token = cacheobj(cache, token, PdfString)
                elif firstch == '%':
                    # Comment
                    if self.strip_comments:
                        continue
                else:
                    self.exception(('Tokenizer logic incorrect -- '
                                    'should never get here'))

            yield token
            # The span was replaced while we were suspended (or by
            # the nested-parenthesis handling above), so the current
            # match iterator is stale: restart from the new location.
            if current[0] is not tokspan:
                break
        else:
            # Ran out of data.  End the generator with a plain
            # return: raising StopIteration inside a generator body
            # is turned into RuntimeError on Python 3.7+ (PEP 479).
            return
def msg(self, msg, *arg):
    ''' Build a diagnostic string for the current token position.

        msg is a format string and arg its optional %-arguments.
        The result has the line and column of the most recent
        token appended and, when that token is not empty, an
        abbreviated repr of its text.

        When self.msgs_dumped is a set, each distinct format string
        is only reported once: for a repeat, None is returned
        instead of a message.
    '''
    already_seen = self.msgs_dumped
    if already_seen is not None:
        if msg in already_seen:
            return None
        already_seen.add(msg)

    text = msg % arg if arg else msg

    start, stop = self.current[0]
    line, col = linepos(self.fdata, start)
    if stop <= start:
        return '%s (line=%d, col=%d)' % (text, line, col)

    snippet = self.fdata[start:stop].rstrip()
    if len(snippet) > 30:
        # Keep the report to a readable width.
        snippet = snippet[:26] + ' ...'
    return ('%s (line=%d, col=%d, token=%s)' %
            (text, line, col, repr(snippet)))
def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    ''' Convert a pdfrw stream dictionary to a reportlab stream.

        Returns the reportlab reference to the new stream.  The
        reference is also recorded in pdfobj.derived_rl_obj[rldoc],
        paired with the short XObject name (or None when pdfobj
        is not an XObject).
    '''
    rl_attrs = RLDict()
    rl_stream = RLStream(rl_attrs, convert_store(pdfobj.stream))

    short = full = None
    if pdfobj.Type == xobjtype:
        # XObjects are addressed by name, so get one from the document.
        short = 'pdfrw_%s' % (rldoc.objectcounter + 1)
        full = rldoc.getXObjectName(short)

    reference = rldoc.Reference(rl_stream, full)
    # Recorded before the entries are converted, so the result is
    # already available while the values below are being processed.
    pdfobj.derived_rl_obj[rldoc] = reference, short

    for name, value in pdfobj.iteritems():
        # name[1:] drops the first character of the pdfrw key
        rl_attrs[name[1:]] = makerl_recurse(rldoc, value)

    return reference
_makearray(rldoc, pdfobj): + rlobj = rlarray = RLArray([]) + if pdfobj.indirect: + rlobj.__RefOnly__ = 1 + rlobj = rldoc.Reference(rlobj) + pdfobj.derived_rl_obj[rldoc] = rlobj, None + + mylist = rlarray.sequence + for value in pdfobj: + mylist.append(makerl_recurse(rldoc, value)) + + return rlobj + + +def _makestr(rldoc, pdfobj): + assert isinstance(pdfobj, (float, int, str)), repr(pdfobj) + # TODO: Add fix for float like in pdfwriter + return str(getattr(pdfobj, 'encoded', pdfobj)) + + +def makerl_recurse(rldoc, pdfobj): + docdict = getattr(pdfobj, 'derived_rl_obj', None) + if docdict is not None: + value = docdict.get(rldoc) + if value is not None: + return value[0] + if isinstance(pdfobj, PdfDict): + if pdfobj.stream is not None: + func = _makestream + else: + func = _makedict + if docdict is None: + pdfobj.private.derived_rl_obj = {} + elif isinstance(pdfobj, PdfArray): + func = _makearray + if docdict is None: + pdfobj.derived_rl_obj = {} + else: + func = _makestr + return func(rldoc, pdfobj) + + +def makerl(canv, pdfobj): + try: + rldoc = canv._doc + except AttributeError: + rldoc = canv + rlobj = makerl_recurse(rldoc, pdfobj) + try: + name = pdfobj.derived_rl_obj[rldoc][1] + except AttributeError: + name = None + return name or rlobj diff --git a/pdfrw/uncompress.py b/pdfrw/uncompress.py new file mode 100644 index 0000000..6780d5d --- /dev/null +++ b/pdfrw/uncompress.py @@ -0,0 +1,106 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# Copyright (C) 2012-2015 Nerijus Mika +# MIT license -- See LICENSE.txt for details +# Copyright (c) 2006, Mathieu Fenniak +# BSD license -- see LICENSE.txt for details +''' +A small subset of decompression filters. Should add more later. + +I believe, after looking at the code, that portions of the flate +PNG predictor were originally transcribed from PyPDF2, which is +probably an excellent source of additional filters. 
def flate_png(data, parms):
    ''' Undo PNG prediction on decompressed stream data.

        PNG prediction is used to make certain kinds of data
        more compressible.  Before the compression, each data
        byte is either left the same, or is set to be a delta
        from the previous byte, or is set to be a delta from
        the previous row.  This selection is done on a per-row
        basis, and is indicated by a compression type byte
        prepended to each row of data.

        Within more recent PDF files, it is normal to use
        this technique for Xref stream objects, which are
        quite regular.

        data is the raw byte string; parms supplies the row width
        through its Columns attribute (one byte per column is
        assumed, and a missing value is treated as 1 -- NOTE(review):
        confirm this matches the PDF default for /Columns).

        Returns (decoded_bytes, None) on success, or
        (None, error_message) when the data cannot be decoded.
        Only filter types 0 (None), 1 (Sub) and 2 (Up) are supported.
    '''
    columns = int(parms.Columns or 1)
    rowlen = columns + 1
    data = bytearray(data)
    if len(data) % rowlen:
        return None, ('PNG predictor data length %d is not a multiple '
                      'of the row length %d' % (len(data), rowlen))

    decoded = bytearray()
    # The row "above" the first row is all zeros.
    prior = bytearray(columns)
    for start in range(0, len(data), rowlen):
        filter_type = data[start]
        row = data[start + 1:start + rowlen]
        if filter_type == 1:
            # Sub: delta from the byte to the left.  The first byte
            # has no left neighbour and is stored as-is; the filter
            # type byte must not be mistaken for one.
            for index in range(1, columns):
                row[index] = (row[index] + row[index - 1]) & 0xFF
        elif filter_type == 2:
            # Up: delta from the same column in the previous row.
            for index in range(columns):
                row[index] = (row[index] + prior[index]) & 0xFF
        elif filter_type != 0:
            return None, 'Unsupported PNG filter %d' % filter_type
        decoded += row
        prior = row
    return bytes(decoded), None
#!/usr/bin/env python

from setuptools import setup
from pdfrw import __version__ as version
from pdfrw.py23_diffs import convert_load

# Read the long description up front, inside a context manager,
# so the README file handle is closed deterministically.
with open("README.rst", 'rb') as readme:
    long_description = convert_load(readme.read())

setup(
    name='pdfrw',
    version=version,
    description='PDF file reader/writer library',
    long_description=long_description,
    author='Patrick Maupin',
    author_email='pmaupin@gmail.com',
    platforms='Independent',
    url='https://github.com/pmaupin/pdfrw',
    packages=['pdfrw', 'pdfrw.objects'],
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',

        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
        'Topic :: Software Development :: Libraries',
        'Topic :: Text Processing',
        'Topic :: Printing',
        'Topic :: Utilities',
    ],
    keywords='pdf vector graphics PDF nup watermark split join merge',
)
/usr/bin/env python2 + +import sys +import os +import subprocess +import hashlib + +import expected +import static_pdfs + +source_pdfs = static_pdfs.pdffiles[0] +source_pdfs = dict((os.path.basename(x), x) for x in source_pdfs) + +result_dir = expected.result_dir + +for subdir in sorted(os.listdir(result_dir)): + dstd = os.path.join(result_dir, subdir) + if not os.path.isdir(dstd): + continue + for pdffile in sorted(os.listdir(dstd)): + testname = '%s/%s' % (subdir, pdffile) + srcf = source_pdfs.get(pdffile) + dstf = os.path.join(dstd, pdffile) + if pdffile not in source_pdfs: + print('\n Skipping %s -- source not found' % testname) + continue + + with open(dstf, 'rb') as f: + data = f.read() + hash = hashlib.md5(data).hexdigest() + skipset = set((hash, 'skip', 'xfail', 'fail', '!' + hash)) + if expected.results[testname] & skipset: + print('\n Skipping %s -- marked done' % testname) + continue + if os.path.exists('foobar.pdf'): + os.remove('foobar.pdf') + builtdiff = False + while 1: + sys.stdout.write(''' + Test case %s + + c = compare using imagemagick and okular + f = display foobar.pdf (result from comparison) + o = display results with okular + a = display results with acrobat + + s = mark 'skip' and go to next PDF + g = mark as good and go to next PDF + b = mark as bad and go to next PDF + n = next pdf without marking + q = quit +--> ''' % testname) + sel = raw_input() + if sel == 'q': + raise SystemExit(0) + if sel == 'n': + break + if sel == 'c': + subprocess.call(('compare', '-verbose', srcf, dstf, + 'foobar.pdf')) + builtdiff = True + continue + if sel == 'f': + subprocess.call(('okular', 'foobar.pdf')) + continue + if sel == 'o': + subprocess.call(('okular', srcf, dstf)) + continue + if sel == 'a': + if builtdiff: + subprocess.call(('acroread', srcf, dstf, 'foobar.pdf')) + else: + subprocess.call(('acroread', srcf, dstf)) + continue + + if sel in 'sgb': + results = (hash if sel == 'g' else + ' skip' if sel == 's' else '!'+hash) + with 
open(expected.expectedf, 'a') as f: + f.write('%s %s\n' % (testname, results)) + break diff --git a/tests/expected.py b/tests/expected.py new file mode 100644 index 0000000..a65c989 --- /dev/null +++ b/tests/expected.py @@ -0,0 +1,41 @@ +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' + Read expected.txt, which should be in the format: + + testname/srcname.pdf validhash + + More than one validhash is allowed (on separate lines), + and hash-delimited comments are allowed. +''' + +import os +import collections +from pdfrw.py23_diffs import convert_load + +root_dir = os.path.dirname(__file__) +result_dir = 'tmp_results' +if os.path.exists('ramdisk'): + result_dir = os.path.join('ramdisk', result_dir) +result_dir = os.path.join(root_dir, result_dir) + +for sourcef in ('mytests.txt', 'expected.txt'): + expectedf = os.path.join(root_dir, sourcef) + if os.path.exists(expectedf): + break + + +def results(): + results = collections.defaultdict(set) + with open(expectedf, 'rb') as f: + for line in f: + line = convert_load(line) + line = line.split('#', 1)[0].split() + if not line: + continue + key, value = line + results[key].add(value) + return results +results = results() diff --git a/tests/expected.txt b/tests/expected.txt new file mode 100644 index 0000000..64eecdd --- /dev/null +++ b/tests/expected.txt @@ -0,0 +1,140 @@ +# Example programs + +examples/4up_b1c400de699af29ea3f1983bb26870ab 1b73c612c40b5082d955ed72f63644bd +examples/alter_b1c400de699af29ea3f1983bb26870ab 3c3ee465f45a685ba7098691be05a5ab +examples/booklet_b1c400de699af29ea3f1983bb26870ab d711b74110eefb4e9e6bf1a5bea16bfe +examples/extract_1975ef8db7355b1d691bc79d0749574b b4f5ee36a288da970ed040a9a733c8b0 +examples/extract_c5c895deecf7a7565393587e0d61be2b 539aad09ef80907bb396c3260eb87d7b +examples/extract_d711b74110eefb4e9e6bf1a5bea16bfe 26ddfd09c6e6002228f06782c8544ac4 
+examples/print_two_b1c400de699af29ea3f1983bb26870ab 73c8a16aba44548c2c06dae6e2551961 +examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb51265af08029 +examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b + +examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d +examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 841c980dfadf2cc47ad86e4649ca69b6 +examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 41989bb2cb6225c6e14262ff5d4f151f +examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612 + +# All these are in the poster test +examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a68e8f8df +examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86 +examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d + +examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab 959d6246ad8bda72bd023e8681216d17 +examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 45b4ae29a038271896b7264bbed63bdf +examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 822bce1cb9e053f1f3f6b922bf27fab8 +examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 97ad6a8ca3fe7cc4e1f0ffb8475355e9 + +# List things that need work here (typically cause exceptions) + +# Bad info dict -- works otherwise + +simple/b1c400de699af29ea3f1983bb26870ab.pdf ecf2e28de18a724b53670c0d5637ec28 +repaginate/b1c400de699af29ea3f1983bb26870ab.pdf 4d7d6c5f6e14c6eac1dfc055cebfa499 + +# 07b0ba4 is missing an object. Best we can do is report it +# (and we do) + +repaginate/07b0ba4cff1c6ff73fd468b04b013457.pdf 993c763e085bce7ecc941ba104f4c892 +simple/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3 + +#b107 has a single page, but with an empty contents dict. 
+ +repaginate/b107669d1dd69eabb89765fabb2cb321.pdf 0652d2da25b50cad75863d0e2bbaa878 +simple/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1 + +# Encrypted files + +repaginate/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip +repaginate/6e122f618c27f3aa9a689423e3be6b8d.pdf skip +repaginate/7dc787639aa6765214e9ff5494d231ed.pdf skip +repaginate/b4b27aaa1f9c7c524298e98be279bebb.pdf skip +repaginate/b5b6c6405d7b48418bccf97277957664.pdf skip +repaginate/bd0ef57aec16ded45bd89d61b54af0be.pdf skip +repaginate/dbb807a878ac1da6b91ac15c9de4e209.pdf skip +simple/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip +simple/6e122f618c27f3aa9a689423e3be6b8d.pdf skip +simple/7dc787639aa6765214e9ff5494d231ed.pdf skip +simple/b4b27aaa1f9c7c524298e98be279bebb.pdf skip +simple/b5b6c6405d7b48418bccf97277957664.pdf skip +simple/bd0ef57aec16ded45bd89d61b54af0be.pdf skip +simple/dbb807a878ac1da6b91ac15c9de4e209.pdf skip + + + +# List good hashes for round-trips here. + +repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b +repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1 +repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8 +repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5 +repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 53e5510be27db134edf3cf23873914af +repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598 +repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 019aead1450842406a04c508243e5161 +repaginate/22628a7ed578b622520325673ab2a4f2.pdf 255776a6956918c7b324dede711680ae +repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61 +repaginate/295d26e61a85635433f8e4b768953f60.pdf 13ece51f4d2ad25707982765abbcd789 +repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22 +repaginate/319c998910453bc44d40c7748cd2cb79.pdf c1a19d1acc3f172711bdbea000cf392e +repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 
3568e1c885a461b350c790ec5b729af3 +repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0 +repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e +repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2 +repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf d0b7467d7bd6c7f73b7764b06c0be1aa +repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 6b0ab50c247ca43b70b2b2f27ee2c1a2 +repaginate/5f0cff36d0ad74536a6513a98a755016.pdf b65c2557988db8625c0761bab1d131f1 +repaginate/5f265db2736850782aeaba2571a3c749.pdf 9bb5644ede0ee7cf99642729eda76686 +repaginate/6a42c8c79b807bf164d31071749e07b0.pdf 33a231263e1a4203338b7b1052fc0091 +repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 93419e831e436d9093a153f35d3441c3 +repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261 +repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf 6c65526ab372d72cb185933e3d2584ef +repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 +repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233 +repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea +repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273 +repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae +repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf b0d1f3925423f9c3ecf4a47baa949f75 +repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd +repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840 +repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68 +repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa +repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273 +repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d + +simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469 
+simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef +simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334 +simple/09715ec1a7b0f3a7ae02b3046f627b9f.pdf c4e4b3b725bd5fc3b008f1ac6251ad1c +simple/1975ef8db7355b1d691bc79d0749574b.pdf 475c28c9588f3a7f6110d30f391758c4 +simple/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 3f17f19fd92adf01998bb13a0ee52b92 +simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7 +simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d +simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097 +simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa +simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf +simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4 +simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef +simple/365b9c95574ee8944370fe286905d0e8.pdf cf3bfac41f410bf5bd657e3f906dfbc6 +simple/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e +simple/49e31fd074eca6af981d78d42d0078ec.pdf 2c316537a5b0917634cbbdc5b91511df +simple/536dfc6fbadd87c03eb59375d091eb53.pdf 319851765c70ba103c4191f7ec2148db +simple/569f8094597bbe5b58efc3a7c6e14e87.pdf 025f1bf95cc537c36b8c3a044758b86c +simple/5f0cff36d0ad74536a6513a98a755016.pdf 8476fd75e75394fcbbe02816d0640e7d +simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9 +simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 +simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf +simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05 +simple/707e3e2d17cbe9ec2273414b3b63f333.pdf 4bdf1e57a96ce42717110b4e55098c1a +simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 +simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e +simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf 
c24873bab85b8ecc7c5433d8d802bceb +simple/9d8626d18b1d8807d271e6ffc409446a.pdf 2358d654bf20d2b9d179ab009a615c4e +simple/9f98322c243fe67726d56ccfa8e0885b.pdf 9290b4c32f005e1e4c7f431955246c4c +simple/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6b406128e0ed1ac23dc5a0ee34d1f717 +simple/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c +simple/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 2083f0e55cf06d88df02956a21bfef23 +simple/d6fd9567078b48c86710e9c49173781f.pdf 77464ec5cfdacb61a73b506bc4945631 +simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124 +simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318 +simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27 + diff --git a/tests/myprofile.py b/tests/myprofile.py new file mode 100644 index 0000000..af18a64 --- /dev/null +++ b/tests/myprofile.py @@ -0,0 +1,5 @@ +import cProfile +import unittest +import test_roundtrip + +cProfile.run('unittest.main(test_roundtrip)') diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100755 index 0000000..baa98a6 --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,195 @@ +#! /usr/bin/env python + +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +Run from the directory above like so: + + python -m tests.test_examples + +A PDF that has been determined to be good or bad +should be added to expected.txt with either a good +checksum, or just the word "fail". + +These tests are incomplete, but they allow us to try +out various PDFs. There is a collection of difficult +PDFs available on github. + +In order to use them: + + 1) Insure that github.com/pmaupin/static_pdfs is on your path. + + 2) Use the imagemagick compare program to look at differences + between the static_pdfs/global directory and the tmp_results + directory after you run this. 
+ + +''' +import sys +import os +import hashlib +import subprocess +import static_pdfs +import expected + +from pdfrw.py23_diffs import convert_store +from pdfrw import PdfReader, PdfWriter + +try: + import unittest2 as unittest +except ImportError: + import unittest + + +prog_dir = os.path.join(expected.root_dir, '..', 'examples', '%s.py') +prog_dir = os.path.abspath(prog_dir) +dstdir = os.path.join(expected.result_dir, 'examples') +hashfile = os.path.join(expected.result_dir, 'hashes.txt') + +lookup = static_pdfs.pdffiles[0] +lookup = dict((os.path.basename(x)[:-4], x) for x in lookup) + + +class TestOnePdf(unittest.TestCase): + + def do_test(self, params, prev_results=[''], scrub=False): + params = params.split() + hashkey = 'examples/%s' % '_'.join(params) + params = [lookup.get(x, x) for x in params] + progname = params[0] + params[0] = prog_dir % progname + srcf = params[1] + params.insert(0, sys.executable) + subdir, progname = os.path.split(progname) + subdir = os.path.join(dstdir, subdir) + if not os.path.exists(subdir): + os.makedirs(subdir) + os.chdir(subdir) + dstf = '%s.%s' % (progname, os.path.basename(srcf)) + scrub = scrub and dstf + dstf = dstf if not scrub else 'final.%s' % dstf + hash = '------no-file-generated---------' + expects = expected.results[hashkey] + + # If the test has been deliberately skipped, + # we are done. Otherwise, execute it even + # if we don't know about it yet, so we have + # results to compare. 
+ + result = 'fail' + size = 0 + try: + if 'skip' in expects: + result = 'skip requested' + return self.skipTest(result) + elif 'xfail' in expects: + result = 'xfail requested' + return self.fail(result) + + exists = os.path.exists(dstf) + if expects or not exists: + if exists: + os.remove(dstf) + if scrub and os.path.exists(scrub): + os.remove(scrub) + subprocess.call(params) + if scrub: + PdfWriter().addpages(PdfReader(scrub).pages).write(dstf) + with open(dstf, 'rb') as f: + data = f.read() + size = len(data) + if data: + hash = hashlib.md5(data).hexdigest() + lookup[hash] = dstf + prev_results[0] = hash + else: + os.remove(dstf) + if expects: + if len(expects) == 1: + expects, = expects + self.assertEqual(hash, expects) + else: + self.assertIn(hash, expects) + result = 'pass' + else: + result = 'skip' + self.skipTest('No hash available') + finally: + result = '%8d %-20s %s %s\n' % (size, result, hashkey, hash) + with open(hashfile, 'ab') as f: + f.write(convert_store(result)) + + def test_4up(self): + self.do_test('4up b1c400de699af29ea3f1983bb26870ab') + + def test_booklet_unspread(self): + prev = [None] + self.do_test('booklet b1c400de699af29ea3f1983bb26870ab', prev) + if prev[0] is not None: + self.do_test('unspread ' + prev[0]) + self.do_test('extract ' + prev[0]) + + def test_print_two(self): + self.do_test('print_two b1c400de699af29ea3f1983bb26870ab') + + def test_watermarks(self): + self.do_test('watermark b1c400de699af29ea3f1983bb26870ab ' + '06c86654f9a77e82f9adaa0086fc391c') + self.do_test('watermark b1c400de699af29ea3f1983bb26870ab ' + '06c86654f9a77e82f9adaa0086fc391c -u') + + def test_subset(self): + self.do_test('subset b1c400de699af29ea3f1983bb26870ab 1-3 5') + + def test_alter(self): + self.do_test('alter b1c400de699af29ea3f1983bb26870ab') + + def test_cat(self): + self.do_test('cat b1c400de699af29ea3f1983bb26870ab ' + '06c86654f9a77e82f9adaa0086fc391c') + + def test_rotate(self): + self.do_test('rotate 707e3e2d17cbe9ec2273414b3b63f333 ' + '270 
1-4 7-8 10-50 52-56') + + def test_poster(self): + prev = [None] + self.do_test('subset 1975ef8db7355b1d691bc79d0749574b 21', prev) + self.do_test('rotate %s 90 1' % prev[0], prev) + self.do_test('poster %s' % prev[0], prev) + + def test_extract(self): + self.do_test('extract 1975ef8db7355b1d691bc79d0749574b') + self.do_test('extract c5c895deecf7a7565393587e0d61be2b') + + def test_rl1_4up(self): + if sys.version_info < (2, 7): + return + self.do_test('rl1/4up b1c400de699af29ea3f1983bb26870ab', + scrub=True) + + def test_rl1_booklet(self): + if sys.version_info < (2, 7): + return + self.do_test('rl1/booklet b1c400de699af29ea3f1983bb26870ab', + scrub=True) + + def test_rl1_subset(self): + if sys.version_info < (2, 7): + return + self.do_test('rl1/subset b1c400de699af29ea3f1983bb26870ab 3 5', + scrub=True) + + def test_rl1_platypus(self): + if sys.version_info < (2, 7): + return + self.do_test('rl1/platypus_pdf_template b1c400de699af29ea3f1983bb26870ab', + scrub=True) + +def main(): + unittest.main() + +if __name__ == '__main__': + main() diff --git a/tests/test_pdfstring.py b/tests/test_pdfstring.py new file mode 100644 index 0000000..fce47ef --- /dev/null +++ b/tests/test_pdfstring.py @@ -0,0 +1,39 @@ +#! 
/usr/bin/env python + +''' +Run from the directory above like so: +python -m tests.test_pdfstring +''' + + +import pdfrw +import unittest + + +class TestEncoding(unittest.TestCase): + + @staticmethod + def decode(value): + return pdfrw.objects.PdfString(value).decode() + + @staticmethod + def encode(value): + return str(pdfrw.objects.PdfString.encode(value)) + + @classmethod + def encode_decode(cls, value): + return cls.decode(cls.encode(value)) + + def roundtrip(self, value): + self.assertEqual(value, self.encode_decode(value)) + + def test_doubleslash(self): + self.roundtrip('\\') + + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py new file mode 100755 index 0000000..cb3645e --- /dev/null +++ b/tests/test_roundtrip.py @@ -0,0 +1,132 @@ +#! /usr/bin/env python + +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2015 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +Run from the directory above like so: + + python -m tests.test_roundtrip + +A PDF that has been determined to be good or bad +should be added to expected.txt with either a good +checksum, or just the word "fail". + +These tests are incomplete, but they allow us to try +out various PDFs. There is a collection of difficult +PDFs available on github. + +In order to use them: + + 1) Insure that github.com/pmaupin/static_pdfs is on your path. + + 2) Use the imagemagick compare program to look at differences + between the static_pdfs/global directory and the tmp_results + directory after you run this. 
+ + +''' +import os +import hashlib +import pdfrw +import static_pdfs +import expected + +from pdfrw.py23_diffs import convert_store + +try: + import unittest2 as unittest +except ImportError: + import unittest + + +class TestOnePdf(unittest.TestCase): + + def roundtrip(self, testname, basename, srcf, decompress=False, + compress=False, repaginate=False): + dstd = os.path.join(expected.result_dir, testname) + if not os.path.exists(dstd): + os.makedirs(dstd) + dstf = os.path.join(dstd, basename) + hashfile = os.path.join(expected.result_dir, 'hashes.txt') + hashkey = '%s/%s' % (testname, basename) + hash = '------no-file-generated---------' + expects = expected.results[hashkey] + + # If the test has been deliberately skipped, + # we are done. Otherwise, execute it even + # if we don't know about it yet, so we have + # results to compare. + + result = 'fail' + size = 0 + try: + if 'skip' in expects: + result = 'skip requested' + return self.skipTest(result) + elif 'xfail' in expects: + result = 'xfail requested' + return self.fail(result) + + exists = os.path.exists(dstf) + if expects or not exists: + if exists: + os.remove(dstf) + trailer = pdfrw.PdfReader(srcf, decompress=decompress, + verbose=False) + if trailer.Encrypt: + result = 'skip -- encrypt' + hash = '------skip-encrypt-no-file------' + return self.skipTest('File encrypted') + writer = pdfrw.PdfWriter(compress=compress) + if repaginate: + writer.addpages(trailer.pages) + trailer = None + writer.write(dstf, trailer) + with open(dstf, 'rb') as f: + data = f.read() + size = len(data) + if data: + hash = hashlib.md5(data).hexdigest() + else: + os.remove(dstf) + if expects: + if len(expects) == 1: + expects, = expects + self.assertEqual(hash, expects) + else: + self.assertIn(hash, expects) + result = 'pass' + else: + result = 'skip' + self.skipTest('No hash available') + finally: + result = '%8d %-20s %s %s\n' % (size, result, hashkey, hash) + with open(hashfile, 'ab') as f: + f.write(convert_store(result)) + + 
+def build_tests(): + def test_closure(*args, **kw): + def test(self): + self.roundtrip(*args, **kw) + return test + for mytest, repaginate in ( + ('simple', False), + ('repaginate', True) + ): + for srcf in static_pdfs.pdffiles[0]: + basename = os.path.basename(srcf) + test_name = 'test_%s_%s' % (mytest, basename) + test = test_closure(mytest, basename, srcf, + repaginate=repaginate) + setattr(TestOnePdf, test_name, test) +build_tests() + + +def main(): + unittest.main() + +if __name__ == '__main__': + main()