Import pdfrw_0.4.orig.tar.gz

[dgit import orig pdfrw_0.4.orig.tar.gz]
2017-09-21 14:55:46 +02:00 · 2017-09-21 14:55:46 +02:00 · 5d56e870e8
commit 5d56e870e8
61 changed files with 6937 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,67 @@
+# OSX
+.DS_Store
+.AppleDouble
+.LSOverride
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+
+# Development artifacts
+diffs.txt
+examples/*.pdf
+examples/rl*/*.pdf
+tests/*.pdf
+examples/pdfrw
+examples/rl*/pdfrw
+tests/pdfrw
+tests/static_pdfs
+tests/ramdisk
+tests/saved_results
+tests/tmp_results
+wiki/
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+.Python
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+lib64
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+pyvenv.cfg
+pip-selfcheck.json
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Sphinx documentation
+docs/_build/
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,20 @@
+language: python
+python:
+  - "2.6"
+  - "2.7"
+  - "3.3"
+  - "3.4"
+  - "3.5"
+  - "3.6"
+  - "nightly"
+# command to install dependencies
+before_install:
+  - "git clone https://github.com/pmaupin/static_pdfs tests/static_pdfs"
+install:
+  - "pip install ."
+  - "pip install reportlab || true"
+  - "pip install PyCrypto || true"
+  - "pip install zlib || true"
+  - "pip install unittest2 || true"
+# command to run tests
+script: "cd tests; /usr/bin/env PYTHONPATH=. py.test"
--- a/LICENSE.txt
+++ b/LICENSE.txt
@ -0,0 +1,74 @@
+pdfrw (github.com/pmaupin/pdfrw)
+
+The majority of pdfrw was written by Patrick Maupin and is licensed
+under the MIT license (reproduced below).  Other contributors include
+Attila Tajti and Nerijus Mika.  It appears that some of the decompression
+code was based on the decompressor from PyPDF2, which was written by
+Mathieu Fenniak and licensed under the BSD license (also reproduced below).
+
+Please add any missing authors here:
+
+Copyright (c) 2006-2017  Patrick Maupin. All rights reserved.
+Copyright (c) 2006       Mathieu Fenniak. All rights reserved.
+Copyright (c) 2010       Attila Tajti. All rights reserved.
+Copyright (c) 2012       Nerijus Mika. All rights reserved.
+Copyright (c) 2015       Bastien Gandouet. All rights reserved.
+Copyright (c) 2015       Tzerjen Wei. All rights reserved.
+Copyright (c) 2015       Jorj X. McKie. All rights reserved.
+Copyright (c) 2015       Nicholas Devenish. All rights reserved.
+Copyright (c) 2015-2016  Jonatan Dellagostin. All rights reserved.
+Copyright (c) 2016-2017  Thomas Kluyver. All rights reserved.
+Copyright (c) 2016       James Laird-Wah. All rights reserved.
+Copyright (c) 2016       Marcus Brinkmann. All rights reserved.
+Copyright (c) 2016       Edward Betts. All rights reserved.
+Copyright (c) 2016       Patrick Mazulo. All rights reserved.
+Copyright (c) 2017       Haochen Wu. All rights reserved.
+Copyright (c) 2017       Jon Lund Steffensen. All rights reserved.
+
+
+MIT License:
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+
+BSD License:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* The name of the author may not be used to endorse or promote products
+  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,3 @@
+include *.txt *.in *.rst
+recursive-include examples *.txt *.py
+recursive-include tests *.py
--- a/README.rst
+++ b/README.rst
@ -0,0 +1,789 @@
+==================
+pdfrw 0.4
+==================
+
+:Author: Patrick Maupin
+
+.. contents::
+    :backlinks: none
+
+.. sectnum::
+
+Introduction
+============
+
+**pdfrw** is a Python library and utility that reads and writes PDF files:
+
+* Version 0.4 is tested and works on Python 2.6, 2.7, 3.3, 3.4, 3.5, and 3.6
+* Operations include subsetting, merging, rotating, modifying metadata, etc.
+* The fastest pure Python PDF parser available
+* Has been used for years by a printer in pre-press production
+* Can be used with rst2pdf to faithfully reproduce vector images
+* Can be used either standalone, or in conjunction with `reportlab`__
+  to reuse existing PDFs in new ones
+* Permissively licensed
+
+__ http://www.reportlab.org/
+
+
+pdfrw will faithfully reproduce vector formats without
+rasterization, so the rst2pdf package has used pdfrw
+for PDF and SVG images by default since March 2010.
+
+pdfrw can also be used in conjunction with reportlab, in order
+to re-use portions of existing PDFs in new PDFs created with
+reportlab.
+
+
+Examples
+=========
+
+The library comes with several examples that show operation both with
+and without reportlab.
+
+
+All examples
+------------------
+
+The examples directory has a few scripts which use the library.
+Note that if these examples do not work with your PDF, you should
+try to use pdftk to uncompress and/or unencrypt them first.
+
+* `4up.py`__ will shrink pages down and place 4 of them on
+  each output page.
+* `alter.py`__ shows an example of modifying metadata, without
+  altering the structure of the PDF.
+* `booklet.py`__ shows an example of creating a 2-up output
+  suitable for printing and folding (e.g. on tabloid size paper).
+* `cat.py`__ shows an example of concatenating multiple PDFs together.
+* `extract.py`__ will extract images and Form XObjects (embedded pages)
+  from existing PDFs to make them easier to use and refer to from
+  new PDFs (e.g. with reportlab or rst2pdf).
+* `poster.py`__ increases the size of a PDF so it can be printed
+  as a poster.
+* `print_two.py`__ Allows creation of 8.5 X 5.5" booklets by slicing
+  8.5 X 11" paper apart after printing.
+* `rotate.py`__ Rotates all or selected pages in a PDF.
+* `subset.py`__ Creates a new PDF with only a subset of pages from the
+  original.
+* `unspread.py`__ Takes a 2-up PDF, and splits out pages.
+* `watermark.py`__ Adds a watermark PDF image over or under all the pages
+  of a PDF.
+* `rl1/4up.py`__ Another 4up example, using reportlab canvas for output.
+* `rl1/booklet.py`__ Another booklet example, using reportlab canvas for
+  output.
+* `rl1/subset.py`__ Another subsetting example, using reportlab canvas for
+  output.
+* `rl1/platypus_pdf_template.py`__ Another watermarking example, using
+  reportlab canvas and generated output for the document.  Contributed
+  by user asannes.
+* `rl2`__ Experimental code for parsing graphics.  Needs work.
+* `subset_booklets.py`__ shows an example of creating a full printable pdf
+  version in a more professional and practical way ( take a look at
+  http://www.wikihow.com/Bind-a-Book )
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/extract.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/poster.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rotate.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/unspread.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/watermark.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/4up.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset_booklets.py
+
+Notes on selected examples
+------------------------------------
+
+Reorganizing pages and placing them two-up
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A printer with a fancy printer and/or a full-up copy of Acrobat can
+easily turn your small PDF into a little booklet (for example, print 4
+letter-sized pages on a single 11" x 17").
+
+But that assumes several things, including that the personnel know how
+to operate the hardware and software. `booklet.py`__ lets you turn your PDF
+into a preformatted booklet, to give them fewer chances to mess it up.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py
+
+Adding or modifying metadata
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The `cat.py`__ example will accept multiple input files on the command
+line, concatenate them and output them to output.pdf, after adding some
+nonsensical metadata to the output PDF file.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py
+
+The `alter.py`__ example alters a single metadata item in a PDF,
+and writes the result to a new PDF.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
+
+
+One difference is that, since **cat** is creating a new PDF structure,
+and **alter** is attempting to modify an existing PDF structure, the
+PDF produced by alter (and also by watermark.py) *should* be
+more faithful to the original (except for the desired changes).
+
+For example, the alter.py navigation should be left intact, whereas with
+cat.py it will be stripped.
+
+
+Rotating and doubling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you ever want to print something that is like a small booklet, but
+needs to be spiral bound, you either have to do some fancy rearranging,
+or just waste half your paper.
+
+The `print_two.py`__ example program will, for example, make two side-by-side
+copies of each page of your PDF on each output sheet.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py
+
+But, every other page is flipped, so that you can print double-sided and
+the pages will line up properly and be pre-collated.
+
+Graphics stream parsing proof of concept
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The `copy.py`__ script shows a simple example of reading in a PDF, and
+using the decodegraphics.py module to try to write the same information
+out to a new PDF through a reportlab canvas. (If you know about reportlab,
+you know that if you can faithfully render a PDF to a reportlab canvas, you
+can do pretty much anything else with that PDF you want.) This kind of
+low level manipulation should be done only if you really need to.
+decodegraphics is really more of a proof of concept than anything
+else. For most cases, just use the Form XObject capability, as shown in
+the examples/rl1/booklet.py demo.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/copy.py
+
+pdfrw philosophy
+==================
+
+Core library
+-------------
+
+The philosophy of the library portion of pdfrw is to provide intuitive
+functions to read, manipulate, and write PDF files.  There should be
+minimal leakage between abstraction layers, although getting useful
+work done makes "pure" functionality separation difficult.
+
+A key concept supported by the library is the use of Form XObjects,
+which allow easy embedding of pieces of one PDF into another.
+
+Addition of core support to the library is typically done carefully
+and thoughtfully, so as not to clutter it up with too many special
+cases.
+
+There are a lot of incorrectly formatted PDFs floating around; support
+for these is added in some cases.  The decision is often based on what
+acroread and okular do with the PDFs; if they can display them properly,
+then eventually pdfrw should, too, if it is not too difficult or costly.
+
+Contributions are welcome; one user has contributed some decompression
+filters and the ability to process PDF 1.5 stream objects.  Additional
+functionality that would obviously be useful includes additional
+decompression filters, the ability to process password-protected PDFs,
+and the ability to output linearized PDFs.
+
+Examples
+--------
+
+The philosophy of the examples is to provide small, easily-understood
+examples that showcase pdfrw functionality.
+
+
+PDF files and Python
+======================
+
+Introduction
+------------
+
+In general, PDF files conceptually map quite well to Python. The major
+objects to think about are:
+
+-  **strings**. Most things are strings. These also often decompose
+   naturally into
+-  **lists of tokens**. Tokens can be combined to create higher-level
+   objects like
+-  **arrays** and
+-  **dictionaries** and
+-  **Contents streams** (which can be more streams of tokens)
+
+Difficulties
+------------
+
+The apparent primary difficulty in mapping PDF files to Python is the
+PDF file concept of "indirect objects."  Indirect objects provide
+the efficiency of allowing a single piece of data to be referred to
+from more than one containing object, but probably more importantly,
+indirect objects provide a way to get around the chicken and egg
+problem of circular object references when mapping arbitrary data
+structures to files. To flatten out a circular reference, an indirect
+object is *referred to* instead of being *directly included* in another
+object. PDF files have a global mechanism for locating indirect objects,
+and they all have two reference numbers (a reference number and a
+"generation" number, in case you wanted to append to the PDF file
+rather than just rewriting the whole thing).
+
+pdfrw automatically handles indirect references on reading in a PDF
+file. When pdfrw encounters an indirect PDF file object, the
+corresponding Python object it creates will have an 'indirect' attribute
+with a value of True. When writing a PDF file, if you have created
+arbitrary data, you just need to make sure that circular references are
+broken up by putting an attribute named 'indirect' which evaluates to
+True on at least one object in every cycle.
+
+Another PDF file concept that doesn't quite map to regular Python is a
+"stream". Streams are dictionaries which each have an associated
+unformatted data block. pdfrw handles streams by placing a special
+attribute on a subclassed dictionary.
+
+Usage Model
+-----------
+
+The usage model for pdfrw treats most objects as strings (it takes their
+string representation when writing them to a file). The two main
+exceptions are the PdfArray object and the PdfDict object.
+
+PdfArray is a subclass of list with two special features.  First,
+an 'indirect' attribute allows a PdfArray to be written out as
+an indirect PDF object.  Second, pdfrw reads files lazily, so
+PdfArray knows about, and resolves references to other indirect
+objects on an as-needed basis.
+
+PdfDict is a subclass of dict that also has an indirect attribute
+and lazy reference resolution as well.  (And the subclassed
+IndirectPdfDict has indirect automatically set True).
+
+But PdfDict also has an optional associated stream. The stream object
+defaults to None, but if you assign a stream to the dict, it will
+automatically set the PDF /Length attribute for the dictionary.
+
+Finally, since PdfDict instances are indexed by PdfName objects (which
+always start with a /) and since most (all?) standard Adobe PdfName
+objects use names formatted like "/CamelCase", it makes sense to allow
+access to dictionary elements via object attribute accesses as well as
+object index accesses. So usage of PdfDict objects is normally via
+attribute access, although non-standard names (though still with a
+leading slash) can be accessed via dictionary index lookup.
+
+Reading PDFs
+~~~~~~~~~~~~~~~
+
+The PdfReader object is a subclass of PdfDict, which allows easy access
+to an entire document::
+
+    >>> from pdfrw import PdfReader
+    >>> x = PdfReader('source.pdf')
+    >>> x.keys()
+    ['/Info', '/Size', '/Root']
+    >>> x.Info
+    {'/Producer': '(cairo 1.8.6 (http://cairographics.org))',
+     '/Creator': '(cairo 1.8.6 (http://cairographics.org))'}
+    >>> x.Root.keys()
+    ['/Type', '/Pages']
+
+Info, Size, and Root are retrieved from the trailer of the PDF file.
+
+In addition to the tree structure, pdfrw creates a special attribute
+named *pages*, that is a list of all the pages in the document. pdfrw
+creates the *pages* attribute as a simplification for the user, because
+the PDF format allows arbitrarily complicated nested dictionaries to
+describe the page order. Each entry in the *pages* list is the PdfDict
+object for one of the pages in the file, in order.
+
+::
+
+    >>> len(x.pages)
+    1
+    >>> x.pages[0]
+    {'/Parent': {'/Kids': [{...}], '/Type': '/Pages', '/Count': '1'},
+     '/Contents': {'/Length': '11260', '/Filter': None},
+     '/Resources': ... (Lots more stuff snipped)
+    >>> x.pages[0].Contents
+    {'/Length': '11260', '/Filter': None}
+    >>> x.pages[0].Contents.stream
+    'q\n1 1 1 rg /a0 gs\n0 0 0 RG 0.657436
+      w\n0 J\n0 j\n[] 0.0 d\n4 M q' ... (Lots more stuff snipped)
+
+Writing PDFs
+~~~~~~~~~~~~~~~
+
+As you can see, it is quite easy to dig down into a PDF document. But
+what about when it's time to write it out?
+
+::
+
+    >>> from pdfrw import PdfWriter
+    >>> y = PdfWriter()
+    >>> y.addpage(x.pages[0])
+    >>> y.write('result.pdf')
+
+That's all it takes to create a new PDF. You may still need to read the
+`Adobe PDF reference manual`__ to figure out what needs to go *into*
+the PDF, but at least you don't have to sweat actually building it
+and getting the file offsets right.
+
+__ http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
+
+Manipulating PDFs in memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For the most part, pdfrw tries to be agnostic about the contents of
+PDF files, and support them as containers, but to do useful work,
+something a little higher-level is required, so pdfrw works to
+understand a bit about the contents of the containers.  For example:
+
+-  PDF pages. pdfrw knows enough to find the pages in PDF files you read
+   in, and to write a set of pages back out to a new PDF file.
+-  Form XObjects. pdfrw can take any page or rectangle on a page, and
+   convert it to a Form XObject, suitable for use inside another PDF
+   file.  It knows enough about these to perform scaling, rotation,
+   and positioning.
+-  reportlab objects. pdfrw can recursively create a set of reportlab
+   objects from its internal object format. This allows, for example,
+   Form XObjects to be used inside reportlab, so that you can reuse
+   content from an existing PDF file when building a new PDF with
+   reportlab.
+
+There are several examples that demonstrate these features in
+the example code directory.
+
+Missing features
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Even as a pure PDF container library, pdfrw comes up a bit short. It
+does not currently support:
+
+-  Most compression/decompression filters
+-  encryption
+
+`pdftk`__ is a wonderful command-line
+tool that can convert your PDFs to remove encryption and compression.
+However, in most cases, you can do a lot of useful work with PDFs
+without actually removing compression, because only certain elements
+inside PDFs are actually compressed.
+
+__ https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
+
+Library internals
+==================
+
+Introduction
+------------
+
+**pdfrw** currently consists of 19 modules organized into a main
+package and one sub-package.
+
+The `__init__.py`__ module does the usual thing of importing a few
+major attributes from some of the submodules, and the `errors.py`__
+module supports logging and exception generation.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/__init__.py
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/errors.py
+
+
+PDF object model support
+--------------------------
+
+The `objects`__ sub-package contains one module for each of the
+internal representations of the kinds of basic objects that exist
+in a PDF file, with the `objects/__init__.py`__ module in that
+package simply gathering them up and making them available to the
+main pdfrw package.
+
+One feature that all the PDF object classes have in common is the
+inclusion of an 'indirect' attribute. If 'indirect' exists and evaluates
+to True, then when the object is written out, it is written out as an
+indirect object. That is to say, it is addressable in the PDF file, and
+could be referenced by any number (including zero) of container objects.
+This indirect object capability saves space in PDF files by allowing
+objects such as fonts to be referenced from multiple pages, and also
+allows PDF files to contain internal circular references.  This latter
+capability is used, for example, when each page object has a "parent"
+object in its dictionary.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/__init__.py
+
+Ordinary objects
+~~~~~~~~~~~~~~~~
+
+The `objects/pdfobject.py`__ module contains the PdfObject class, which is
+a subclass of str, and is the catch-all object for any PDF file elements
+that are not explicitly represented by other objects, as described below.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfobject.py
+
+Name objects
+~~~~~~~~~~~~
+
+The `objects/pdfname.py`__ module contains the PdfName singleton object,
+which will convert a string into a PDF name by prepending a slash. It can
+be used either by calling it or getting an attribute, e.g.::
+
+    PdfName.Rotate == PdfName('Rotate') == PdfObject('/Rotate')
+
+In the example above, there is a slight difference between the objects
+returned from PdfName, and the object returned from PdfObject.  The
+PdfName objects are actually objects of class "BasePdfName".  This
+is important, because only these may be used as keys in PdfDict objects.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfname.py
+
+String objects
+~~~~~~~~~~~~~~
+
+The `objects/pdfstring.py`__
+module contains the PdfString class, which is a subclass of str that is
+used to represent encoded strings in a PDF file. The class has encode
+and decode methods for the strings.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfstring.py
+
+
+Array objects
+~~~~~~~~~~~~~
+
+The `objects/pdfarray.py`__
+module contains the PdfArray class, which is a subclass of list that is
+used to represent arrays in a PDF file. A regular list could be used
+instead, but use of the PdfArray class allows for an indirect attribute
+to be set, and also allows for proxying of unresolved indirect objects
+(that haven't been read in yet) in a manner that is transparent to pdfrw
+clients.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfarray.py
+
+Dict objects
+~~~~~~~~~~~~
+
+The `objects/pdfdict.py`__
+module contains the PdfDict class, which is a subclass of dict that is
+used to represent dictionaries in a PDF file. A regular dict could be
+used instead, but the PdfDict class matches the requirements of PDF
+files more closely:
+
+* Transparent (from the library client's viewpoint) proxying
+  of unresolved indirect objects
+* Return of None for non-existent keys (like dict.get)
+* Mapping of attribute accesses to the dict itself
+  (pdfdict.Foo == pdfdict[PdfName('Foo')])
+* Automatic management of following stream and /Length attributes
+  for content dictionaries
+* Indirect attribute
+* Other attributes may be set for private internal use of the
+  library and/or its clients.
+* Support for searching parent dictionaries for PDF "inheritable"
+  attributes.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfdict.py
+
+If a PdfDict has an associated data stream in the PDF file, the stream
+is accessed via the 'stream' (all lower-case) attribute.  Setting the
+stream attribute on the PdfDict will automatically set the /Length attribute
+as well.  If that is not what is desired (for example if the stream
+is compressed), then _stream (same name with an underscore) may be used
+to associate the stream with the PdfDict without setting the length.
+
+To set private attributes (that will not be written out to a new PDF
+file) on a dictionary, use the 'private' attribute::
+
+    mydict.private.foo = 1
+
+Once the attribute is set, it may be accessed directly as an attribute
+of the dictionary::
+
+    foo = mydict.foo
+
+Some attributes of PDF pages are "inheritable."  That is, they may
+belong to a parent dictionary (or a parent of a parent dictionary, etc.)
+The "inheritable" attribute allows for easy discovery of these::
+
+    mediabox = mypage.inheritable.MediaBox
+
+
+Proxy objects
+~~~~~~~~~~~~~
+
+The `objects/pdfindirect.py`__
+module contains the PdfIndirect class, which is a non-transparent proxy
+object for PDF objects that have not yet been read in and resolved from
+a file. Although these are non-transparent inside the library, client code
+should never see one of these -- they exist inside the PdfArray and PdfDict
+container types, but are resolved before being returned to a client of
+those types.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfindirect.py
+
+
+File reading, tokenization and parsing
+--------------------------------------
+
+`pdfreader.py`__
+contains the PdfReader class, which can read a PDF file (or be passed a
+file object or already read string) and parse it. It uses the PdfTokens
+class in `tokens.py`__  for low-level tokenization.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfreader.py
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/tokens.py
+
+
+The PdfReader class does not, in general, parse into containers (e.g.
+inside the content streams). There is a proof of concept for doing that
+inside the examples/rl2 subdirectory, but that is slow and not well-developed,
+and not useful for most applications.
+
+An instance of the PdfReader class is an instance of a PdfDict -- the
+trailer dictionary of the PDF file, to be exact.  It will have a private
+attribute set on it that is named 'pages' that is a list containing all
+the pages in the file.
+
+When instantiating a PdfReader object, there are options available
+for decompressing all the objects in the file.  pdfrw does not currently
+have very many options for decompression, so this is not all that useful,
+except in the specific case of compressed object streams.
+
+Also, there are no options for decryption yet.  If you have PDF files
+that are encrypted or heavily compressed, you may find that using another
+program like pdftk on them can make them readable by pdfrw.
+
+In general, the objects are read from the file lazily, but this is not
+currently true with compressed object streams -- all of these are decompressed
+and read in when the PdfReader is instantiated.
+
+
+File output
+-----------
+
+`pdfwriter.py`__
+contains the PdfWriter class, which can create and output a PDF file.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfwriter.py
+
+There are a few options available when creating and using this class.
+
+In the simplest case, an instance of PdfWriter is instantiated, and
+then pages are added to it from one or more source files (or created
+programmatically), and then the write method is called to dump the
+results out to a file.
+
+If you have a source PDF and do not want to disturb the structure
+of it too badly, then you may pass its trailer directly to PdfWriter
+rather than letting PdfWriter construct one for you.  There is an
+example of this (alter.py) in the examples directory.
+
+
+Advanced features
+-----------------
+
+`buildxobj.py`__
+contains functions to build Form XObjects out of pages or rectangles on
+pages.  These may be reused in new PDFs essentially as if they were images.
+
+buildxobj is careful to cache any page used so that it only appears in
+the output once.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/buildxobj.py
+
+
+`toreportlab.py`__
+provides the makerl function, which will translate pdfrw objects into a
+format which can be used with `reportlab <http://www.reportlab.org/>`__.
+It is normally used in conjunction with buildxobj, to be able to reuse
+parts of existing PDFs when using reportlab.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/toreportlab.py
+
+
+`pagemerge.py`__ builds on the foundation laid by buildxobj.  It
+contains classes to create a new page (or overlay an existing page)
+using one or more rectangles from other pages.  There are examples
+showing its use for watermarking, scaling, 4-up output, splitting
+each page in 2, etc.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pagemerge.py
+
+`findobjs.py`__ contains code that can find specific kinds of objects
+inside a PDF file.  The extract.py example uses this module to create
+a new PDF that places each image and Form XObject from a source PDF onto
+its own page, e.g. for easy reuse with some of the other examples or
+with reportlab.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/findobjs.py
+
+
+Miscellaneous
+----------------
+
+`compress.py`__ and `uncompress.py`__
+contain compression and decompression functions. Very few filters are
+currently supported, so an external tool like pdftk might be good if you
+require the ability to decompress (or, for that matter, decrypt) PDF
+files.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/compress.py
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/uncompress.py
+
+
+`py23_diffs.py`__ contains code to help manage the differences between
+Python 2 and Python 3.
+
+__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/py23_diffs.py
+
+Testing
+===============
+
+The tests associated with pdfrw require a large number of PDFs,
+which are not distributed with the library.
+
+To run the tests:
+
+* Download or clone the full package from github.com/pmaupin/pdfrw
+* cd into the tests directory, and then clone the package
+  github.com/pmaupin/static_pdfs into a subdirectory (also named
+  static_pdfs).
+* Now the tests may be run from that directory using unittest, or
+  py.test, or nose.
+* travisci is used at github, and runs the tests with py.test
+
+Other libraries
+=====================
+
+Pure Python
+-----------
+
+-  `reportlab <http://www.reportlab.org/>`__
+
+    reportlab is must-have software if you want to programmatically
+    generate arbitrary PDFs.
+
+-  `pyPdf <https://github.com/mstamy2/PyPDF2>`__
+
+    pyPdf is, in some ways, very full-featured. It can do decompression
+    and decryption and seems to know a lot about items inside at least
+    some kinds of PDF files. In comparison, pdfrw knows less about
+    specific PDF file features (such as metadata), but focuses on trying
+    to have a more Pythonic API for mapping the PDF file container
+    syntax to Python, and (IMO) has a simpler and better PDF file
+    parser.  The Form XObject capability of pdfrw means that, in many
+    cases, it does not actually need to decompress objects -- they
+    can be left compressed.
+
+-  `pdftools <http://www.boddie.org.uk/david/Projects/Python/pdftools/index.html>`__
+
+    pdftools feels large and I fell asleep trying to figure out how it
+    all fit together, but many others have done useful things with it.
+
+-  `pagecatcher <http://www.reportlab.com/docs/pagecatcher-ds.pdf>`__
+
+    My understanding is that pagecatcher would have done exactly what I
+    wanted when I built pdfrw. But I was on a zero budget, so I've never
+    had the pleasure of experiencing pagecatcher. I do, however, use and
+    like `reportlab <http://www.reportlab.org/>`__ (open source, from
+    the people who make pagecatcher) so I'm sure pagecatcher is great,
+    better documented and much more full-featured than pdfrw.
+
+-  `pdfminer <http://www.unixuser.org/~euske/python/pdfminer/index.html>`__
+
+    This looks like a useful, actively-developed program. It is quite
+    large, but then, it is trying to actively comprehend a full PDF
+    document. From the website:
+
+    "PDFMiner is a suite of programs that help extracting and analyzing
+    text data of PDF documents. Unlike other PDF-related tools, it
+    allows to obtain the exact location of texts in a page, as well as
+    other extra information such as font information or ruled lines. It
+    includes a PDF converter that can transform PDF files into other
+    text formats (such as HTML). It has an extensible PDF parser that
+    can be used for other purposes instead of text analysis."
+
+non-pure-Python libraries
+-------------------------
+
+-  `pyPoppler <https://launchpad.net/poppler-python/>`__ can read PDF
+   files.
+-  `pycairo <http://www.cairographics.org/pycairo/>`__ can write PDF
+   files.
+-  `PyMuPDF <https://github.com/rk700/PyMuPDF>`_ provides high performance
+   rendering of PDF, (Open)XPS, CBZ and EPUB files.
+
+Other tools
+-----------
+
+-  `pdftk <https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/>`__ is a wonderful command
+   line tool for basic PDF manipulation. It complements pdfrw extremely
+   well, supporting many operations such as decryption and decompression
+   that pdfrw cannot do.
+-  `MuPDF <http://www.mupdf.com/>`_ is a free top performance PDF, (Open)XPS, CBZ and EPUB rendering library
+   that also comes with some command line tools. One of those, ``mutool``, overlaps considerably with
+   pdftk's functionality -- except that it is up to 10 times faster.
+
+Release information
+=======================
+
+Revisions:
+
+0.4 -- Released 18 September, 2017
+
+    - Python 3.6 added to test matrix
+    - Proper unicode support for text strings in PDFs added
+    - buildxobj fixes allow better support creating form XObjects
+      out of compressed pages in some cases
+    - Compression fixes for Python 3+
+    - New subset_booklets.py example
+    - Bug with non-compressed indices into compressed object streams fixed
+    - Bug with distinguishing compressed object stream first objects fixed
+    - Better error reporting added for some invalid PDFs (e.g. when reading
+      past the end of file)
+    - Better scrubbing of old bookmark information when writing PDFs, to
+      remove dangling references
+    - Refactoring of pdfwriter, including updating API, to allow future
+      enhancements for things like incremental writing
+    - Minor tokenizer speedup
+    - Some flate decompressor bugs fixed
+    - Compression and decompression tests added
+    - Tests for new unicode handling added
+    - PdfReader.readpages() recursion error (issue #92) fixed.
+    - Initial crypt filter support added
+
+
+0.3 -- Released 19 October, 2016.
+
+    - Python 3.5 added to test matrix
+    - Better support under Python 3.x for in-memory PDF file-like objects
+    - Some pagemerge and Unicode patches added
+    - Changes to logging allow better coexistence with other packages
+    - Fix for "from pdfrw import \*"
+    - New fancy_watermark.py example shows off capabilities of pagemerge.py
+    - metadata.py example renamed to cat.py
+
+
+0.2 -- Released 21 June, 2015.  Supports Python 2.6, 2.7, 3.3, and 3.4.
+
+    - Several bugs have been fixed
+    - New regression test functionally tests core with dozens of
+      PDFs, and also tests examples.
+    - Core has been ported and tested on Python3 by round-tripping
+      several difficult files and observing binary matching results
+      across the different Python versions.
+    - Still only minimal support for compression and no support
+      for encryption or newer PDF features.  (pdftk is useful
+      to put PDFs in a form that pdfrw can use.)
+
+0.1 -- Released to PyPI in 2012.  Supports Python 2.5 - 2.7
+
--- a/examples/4up.py
+++ b/examples/4up.py
@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+'''
+usage:   4up.py my.pdf
+
+Creates 4up.my.pdf with a single output page for every
+4 input pages.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+
+def get4(srcpages):
+    '''Merge the given source pages onto a single half-scale output page.
+
+    Each page is scaled by 0.5.  Bit 0 of the page index selects the
+    x offset (0 or x_increment); bit 1 selects the y offset
+    (y_increment when clear, 0 when set).
+    '''
+    scale = 0.5
+    merged = PageMerge() + srcpages
+    # The offsets are taken from the last two entries of the merged
+    # box *before* any individual page has been scaled.
+    box_w, box_h = merged.xobj_box[2:]
+    x_increment = scale * box_w
+    y_increment = scale * box_h
+    for index, page in enumerate(merged):
+        page.scale(scale)
+        use_x_offset = index & 1
+        at_y_zero = index & 2
+        page.x = x_increment if use_x_offset else 0
+        page.y = 0 if at_y_zero else y_increment
+    return merged.render()
+
+
+# Exactly one command-line argument (the input PDF) is expected; the
+# single-element unpacking raises ValueError for any other count.
+inpfn, = sys.argv[1:]
+outfn = '4up.' + os.path.basename(inpfn)
+pages = PdfReader(inpfn).pages
+writer = PdfWriter(outfn)
+# Emit one output page for every group of (up to) four input pages.
+for index in range(0, len(pages), 4):
+    writer.addpage(get4(pages[index:index + 4]))
+writer.write()
--- a/examples/README.txt
+++ b/examples/README.txt
@ -0,0 +1,32 @@
+Example programs:
+
+4up.py -- Prints pages four-up
+
+alter.py -- Simple example of making a very slight modification to a PDF.
+
+booklet.py -- Converts a PDF into a booklet.
+
+cat.py -- Concatenates multiple PDFs, adds metadata.
+
+poster.py -- Changes the size of a PDF to create a poster
+
+print_two.py  -- this is used when printing two cut-down copies on a single sheet of paper (double-sided).  Requires uncompressed PDF.
+
+rotate.py -- This will rotate selected ranges of pages within a document.
+
+subset.py -- This will retrieve a subset of pages from a document.
+
+watermark.py  -- Adds a watermark to a PDF
+
+rl1/4up.py -- Same as 4up.py, using reportlab for output.  Next simplest reportlab example.
+
+rl1/booklet.py -- Version of booklet.py using reportlab for output.
+
+rl1/platypus_pdf_template.py -- Example using a PDF page as a watermark background with reportlab.
+
+rl1/subset.py -- Same as subset.py, using reportlab for output.  Simplest reportlab example.
+
+rl2/copy.py -- example of how you could parse a graphics stream and then use reportlab for output.
+               Works on a few different PDFs, probably not a suitable starting point for real
+               production work without a lot of work on the library functions.
+
--- a/examples/alter.py
+++ b/examples/alter.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+'''
+usage:   alter.py my.pdf
+
+Creates alter.my.pdf
+
+Demonstrates making a slight alteration to a preexisting PDF file.
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter
+
+# Exactly one command-line argument (the input PDF) is expected.
+inpfn, = sys.argv[1:]
+outfn = 'alter.' + os.path.basename(inpfn)
+
+# Read the document, overwrite the Title entry of its Info dictionary,
+# and write the result out under the new file name.
+trailer = PdfReader(inpfn)
+trailer.Info.Title = 'My New Title Goes Here'
+PdfWriter(outfn, trailer=trailer).write()
--- a/examples/booklet.py
+++ b/examples/booklet.py
@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+'''
+usage:   booklet.py [-p] my.pdf
+
+Creates booklet.my.pdf
+
+Pages organized in a form suitable for booklet printing, e.g.
+to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
+
+If -p is enabled, every output sheet uses the same type of sheet,
+and up to 3 sides may be blank.
+
+Otherwise the two sides in the middle are output at the original
+page size, and at most 1 side is blank.
+
+'''
+
+import os
+import argparse
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+
+def fixpage(*pages):
+    '''Merge the given pages side by side into one rendered page.
+
+    None entries (placeholders for blank sides) are dropped.  The last
+    remaining page is then shifted right by the width of the first one.
+    '''
+    present = (page for page in pages if page is not None)
+    merged = PageMerge() + present
+    first, last = merged[0], merged[-1]
+    last.x += first.w
+    return merged.render()
+
+
+# Command line: one positional input file plus the optional -p flag.
+parser = argparse.ArgumentParser()
+parser.add_argument("input", help="Input pdf file name")
+parser.add_argument("-p", "--padding", action = "store_true",
+                    help="Padding the document so that all pages use the same type of sheet")
+args = parser.parse_args()
+
+inpfn = args.input
+outfn = 'booklet.' + os.path.basename(inpfn)
+ipages = PdfReader(inpfn).pages
+
+# With -p the page count is rounded up to a multiple of 4,
+# otherwise to a multiple of 2.
+if args.padding:
+    pad_to = 4
+else:
+    pad_to = 2
+
+# Make sure we have a correct number of sides
+# (None entries stand in for blank sides; fixpage() drops them)
+ipages += [None]*(-len(ipages)%pad_to)
+
+# Pair pages from the two ends of the list, working inwards:
+# (last, first), then (second, second-to-last), and so on.
+opages = []
+while len(ipages) > 2:
+    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+    opages.append(fixpage(ipages.pop(0), ipages.pop()))
+
+# Whatever is left (at most two entries) is appended unmerged.
+opages += ipages
+
+PdfWriter(outfn).addpages(opages).write()
--- a/examples/cat.py
+++ b/examples/cat.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+'''
+usage:   cat.py <first.pdf> [<next.pdf> ...]
+
+Creates cat.<first.pdf>
+
+This file demonstrates two features:
+
+1) Concatenating multiple input PDFs.
+
+2) adding metadata to the PDF.
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, IndirectPdfDict
+
+inputs = sys.argv[1:]
+if not inputs:
+    # Explicit check rather than assert (asserts are skipped under
+    # "python -O"); print the usage text and exit with an error status.
+    sys.exit(__doc__)
+outfn = 'cat.' + os.path.basename(inputs[0])
+
+# Append the pages of every input file, in command-line order.
+writer = PdfWriter()
+for inpfn in inputs:
+    writer.addpages(PdfReader(inpfn).pages)
+
+# Attach a fresh Info dictionary carrying the document metadata.
+writer.trailer.Info = IndirectPdfDict(
+    Title='your title goes here',
+    Author='your name goes here',
+    Subject='what is it all about?',
+    Creator='some script goes here',
+)
+writer.write(outfn)
--- a/examples/extract.py
+++ b/examples/extract.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+'''
+usage:   extract.py <some.pdf>
+
+Locates Form XObjects and Image XObjects within the PDF,
+and creates a new PDF containing these -- one per page.
+
+Resulting file will be named extract.<some.pdf>
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter
+from pdfrw.findobjs import page_per_xobj
+
+
+# Exactly one command-line argument (the input PDF) is expected.
+inpfn, = sys.argv[1:]
+outfn = 'extract.' + os.path.basename(inpfn)
+# margin is given in points: 0.5 * 72 is half an inch.
+pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72))
+# Refuse to write an output file when nothing was found.
+if not pages:
+    raise IndexError("No XObjects found")
+writer = PdfWriter(outfn)
+writer.addpages(pages)
+writer.write()
--- a/examples/fancy_watermark.py
+++ b/examples/fancy_watermark.py
@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+'''
+Enhanced example of watermarking using form xobjects (pdfrw).
+
+usage:   fancy_watermark.py [-u] my.pdf single_page.pdf
+
+Creates watermark.my.pdf, with every page overlaid with
+first page from single_page.pdf.  If -u is selected, watermark
+will be placed underneath page (painted first).
+
+The stock watermark.py program assumes all pages are the same
+size.  This example deals with pages of differing sizes in order
+to show some concepts of positioning and scaling.
+
+This version applies the watermark such that the upper left
+corner of the watermark is at the upper left corner of the
+document page for odd pages, and the upper right corner of the
+watermark is at the upper right corner of the document page
+for even pages, for each page of the document.
+
+It also rescales the size of the watermark if the watermark
+is too wide for the page.
+
+These scaling and positioning adjustments can easily
+be customized for any particular application.
+
+To handle documents with different page sizes, a cache is
+maintained of a modified intermediate watermark object
+for each page size.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+# Get all the filenames
+# (an optional '-u' flag may appear anywhere among the arguments)
+
+argv = sys.argv[1:]
+underneath = '-u' in argv
+if underneath:
+    del argv[argv.index('-u')]
+inpfn, wmarkfn = argv
+outfn = 'watermark.' + os.path.basename(inpfn)
+
+# Open both the source files
+wmark_trailer = PdfReader(wmarkfn)
+trailer = PdfReader(inpfn)
+
+# Handle different sized pages in same document with
+# a memoization cache, so we don't create more watermark
+# objects than we need to (typically only one per document).
+# Cache keys are (media box, odd-page flag) tuples.
+
+wmark_page = wmark_trailer.pages[0]
+wmark_cache = {}
+
+# Process every page (page numbers start at 1)
+for pagenum, page in enumerate(trailer.pages, 1):
+
+    # Get the media box of the page, and see
+    # if we have a matching watermark in the cache
+    mbox = tuple(float(x) for x in page.MediaBox)
+    odd = pagenum & 1
+    key = mbox, odd
+    wmark = wmark_cache.get(key)
+    if wmark is None:
+
+        # Create and cache a new watermark object.
+        wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]
+
+        # The math is more complete than it probably needs to be,
+        # because the origin of all pages is almost always (0, 0).
+        # Nonetheless, we illustrate all the values and their names.
+
+        page_x, page_y, page_x1, page_y1 = mbox
+        page_w = page_x1 - page_x
+        page_h = page_y1 - page_y  # For illustration, not used
+
+        # Scale the watermark if it is too wide for the page
+        # (Could do the same for height instead if needed)
+        if wmark.w > page_w:
+            wmark.scale(1.0 * page_w / wmark.w)
+
+        # Always put watermark at the top of the page
+        # (but see horizontal positioning for other ideas)
+        wmark.y += page_y1 - wmark.h
+
+        # For odd pages, put it at the left of the page,
+        # and for even pages, put it on the right of the page.
+        if odd:
+            wmark.x = page_x
+        else:
+            wmark.x += page_x1 - wmark.w
+
+        # Optimize the case where the watermark is same width
+        # as page: left and right placement then coincide, so the
+        # same object is also stored under the opposite parity.
+        # ("not odd" is a bool, which hashes and compares equal to
+        # the 0/1 values used in "key".)
+        if page_w == wmark.w:
+            wmark_cache[mbox, not odd] = wmark
+
+    # Add the watermark to the page
+    # (prepend=True paints it first, i.e. underneath the page content)
+    PageMerge(page).add(wmark, prepend=underneath).render()
+
+# Write out the destination file
+PdfWriter(outfn, trailer=trailer).write()
--- a/examples/poster.py
+++ b/examples/poster.py
@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+'''
+usage:   poster.py my.pdf
+
+Shows how to change the size on a PDF.
+
+Motivation:
+
+My daughter needed to create a 48" x 36" poster, but her Mac
+version of Powerpoint only wanted to output 8.5" x 11" for
+some reason.
+
+So she did an 8.5x11" output with 0.5" margin all around
+(actual size of useful area 7.5x10") and we scaled it
+up by 4.8.
+
+We also copy the Info dict to the new PDF.
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict
+
+
+def adjust(page, margin=36, scale=4.8):
+    '''Return a rendered copy of ``page`` with its margin cut off and
+    the remaining area enlarged by ``scale``.
+
+    The view rectangle starts at (margin, margin) and is the page's
+    xobj_box extent reduced by twice the margin in each direction.
+    '''
+    x_lo, y_lo, x_hi, y_hi = PageMerge().add(page).xobj_box
+    trim = 2 * margin
+    viewrect = (margin, margin, x_hi - x_lo - trim, y_hi - y_lo - trim)
+    cropped = PageMerge().add(page, viewrect=viewrect)
+    cropped[0].scale(scale)
+    return cropped.render()
+
+
+# Exactly one command-line argument (the input PDF) is expected.
+inpfn, = sys.argv[1:]
+outfn = 'poster.' + os.path.basename(inpfn)
+reader = PdfReader(inpfn)
+writer = PdfWriter(outfn)
+# Only the first page of the input is converted.
+writer.addpage(adjust(reader.pages[0]))
+# Carry the Info dict over (an empty one if the source has none).
+writer.trailer.Info = IndirectPdfDict(reader.Info or {})
+writer.write()
--- a/examples/print_two.py
+++ b/examples/print_two.py
@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+'''
+usage:   print_two.py my.pdf
+
+Creates print_two.my.pdf
+
+This is only useful when you can cut down sheets of paper to make two
+small documents.  Works for double-sided only right now.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+
+def fixpage(page, count=[0]):
+    '''Return a page holding two copies of ``page`` side by side.
+
+    The mutable default ``count`` is deliberate: it persists between
+    calls and acts as a running page counter.  On odd-numbered calls
+    the copies are rotated by 360 and 180 degrees, on even-numbered
+    calls by 180 and 0 degrees.
+    '''
+    count[0] += 1
+    base_rotation = 180 * (count[0] & 1)
+
+    result = PageMerge()
+    result.add(page, rotate=180 + base_rotation)
+    result.add(page, rotate=base_rotation)
+    # Place the second copy directly to the right of the first.
+    first_copy, second_copy = result[0], result[1]
+    second_copy.x = first_copy.w
+    return result.render()
+
+
+# Exactly one command-line argument (the input PDF) is expected.
+inpfn, = sys.argv[1:]
+outfn = 'print_two.' + os.path.basename(inpfn)
+pages = PdfReader(inpfn).pages
+# Every input page becomes one output page via fixpage().
+PdfWriter(outfn).addpages(fixpage(x) for x in pages).write()
--- a/examples/rl1/4up.py
+++ b/examples/rl1/4up.py
@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+'''
+usage:   4up.py my.pdf
+
+
+Uses Form XObjects and reportlab to create 4up.my.pdf.
+
+Demonstrates use of pdfrw with reportlab.
+
+'''
+
+import sys
+import os
+
+from reportlab.pdfgen.canvas import Canvas
+
+from pdfrw import PdfReader
+from pdfrw.buildxobj import pagexobj
+from pdfrw.toreportlab import makerl
+
+
+def addpage(canvas, allpages):
+    '''Draw the next (up to) four form XObjects 4-up on one canvas page.
+
+    The entries consumed are removed from the front of ``allpages``
+    in place, so the caller can loop until the list is empty.
+    '''
+    pages = allpages[:4]
+    allpages[:4] = []
+
+    # The output page is sized by the largest BBox[2] / BBox[3]
+    # found in this group.
+    x_max = max(xobj.BBox[2] for xobj in pages)
+    y_max = max(xobj.BBox[3] for xobj in pages)
+    canvas.setPageSize((x_max, y_max))
+
+    for index, xobj in enumerate(pages):
+        use_x_offset = index & 1
+        use_y_offset = index <= 1
+        canvas.saveState()
+        canvas.translate(x_max * use_x_offset / 2.0,
+                         y_max * use_y_offset / 2.0)
+        canvas.scale(0.5, 0.5)
+        canvas.doForm(makerl(canvas, xobj))
+        canvas.restoreState()
+    canvas.showPage()
+
+
+def go(argv):
+    '''Write a 4-up version of the single PDF named in ``argv``.
+
+    ``argv`` must hold exactly one file name; the output goes to
+    "4up.<basename>" in the current directory.
+    '''
+    inpfn, = argv
+    outfn = '4up.' + os.path.basename(inpfn)
+
+    xobjs = [pagexobj(page) for page in PdfReader(inpfn).pages]
+    canvas = Canvas(outfn)
+
+    # addpage() removes what it draws, so this ends once all are used.
+    while xobjs:
+        addpage(canvas, xobjs)
+    canvas.save()
+
+
+if __name__ == '__main__':
+    go(sys.argv[1:])
--- a/examples/rl1/README.txt
+++ b/examples/rl1/README.txt
@ -0,0 +1,9 @@
+This directory contains example scripts which read in PDFs
+and convert pages to PDF Form XObjects using pdfrw, and then
+write out the PDFs using reportlab.
+
+The examples, from easiest to hardest, are:
+
+subset.py -- prints a subset of pages
+4up.py -- prints pages 4-up
+booklet.py -- creates a booklet out of the pages
--- a/examples/rl1/booklet.py
+++ b/examples/rl1/booklet.py
@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+'''
+usage:   booklet.py my.pdf
+
+
+Uses Form XObjects and reportlab to create booklet.my.pdf.
+
+Demonstrates use of pdfrw with reportlab.
+
+'''
+
+import sys
+import os
+
+from reportlab.pdfgen.canvas import Canvas
+
+from pdfrw import PdfReader
+from pdfrw.buildxobj import pagexobj
+from pdfrw.toreportlab import makerl
+
+
+def read_and_double(inpfn):
+    '''Read ``inpfn`` and return its pages grouped for booklet output.
+
+    The result is a list of tuples of form XObjects: pairs taken
+    alternately from the two ends of the page list, followed by
+    1-tuples for whatever remains (at most two pages).
+    '''
+    remaining = [pagexobj(page) for page in PdfReader(inpfn).pages]
+    if len(remaining) % 2:
+        # Sentinel -- get same size for back as front
+        remaining.append(remaining[0])
+
+    sheets = []
+    while len(remaining) > 2:
+        last, first = remaining.pop(), remaining.pop(0)
+        sheets.append((last, first))
+        first, last = remaining.pop(0), remaining.pop()
+        sheets.append((first, last))
+    sheets.extend((xobj,) for xobj in remaining)
+    return sheets
+
+
+def make_pdf(outfn, xobjpairs):
+    '''Write one canvas page per entry of ``xobjpairs`` to ``outfn``.
+
+    Each entry is a sequence of form XObjects that are drawn next to
+    each other, left to right.
+    '''
+    canvas = Canvas(outfn)
+    for xobjlist in xobjpairs:
+        # Page size: sum of the BBox[2] values by the largest BBox[3].
+        x = y = 0
+        for xobj in xobjlist:
+            x += xobj.BBox[2]
+            y = max(y, xobj.BBox[3])
+
+        canvas.setPageSize((x, y))
+
+        # Handle blank back page
+        # (an entry whose first and last items compare equal is
+        # presumably the sentinel pair from read_and_double(): only
+        # one copy is drawn, starting one BBox[2] in from the left)
+        if len(xobjlist) > 1 and xobjlist[0] == xobjlist[-1]:
+            xobjlist = xobjlist[:1]
+            x = xobjlist[0].BBox[2]
+        else:
+            x = 0
+        y = 0
+
+        for xobj in xobjlist:
+            canvas.saveState()
+            canvas.translate(x, y)
+            canvas.doForm(makerl(canvas, xobj))
+            canvas.restoreState()
+            # Advance so the next XObject is drawn to the right.
+            x += xobj.BBox[2]
+        canvas.showPage()
+    canvas.save()
+
+
+inpfn, = sys.argv[1:]
+outfn = 'booklet.' + os.path.basename(inpfn)
+
+make_pdf(outfn, read_and_double(inpfn))
--- a/examples/rl1/platypus_pdf_template.py
+++ b/examples/rl1/platypus_pdf_template.py
@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+usage: platypus_pdf_template.py source.pdf
+
+Creates platypus_pdf_template.source.pdf
+
+Example of using pdfrw to use page 1 of a source PDF as the background
+for other pages programmatically generated with Platypus.
+
+Contributed by user asannes
+
+"""
+import sys
+import os
+
+from reportlab.platypus import PageTemplate, BaseDocTemplate, Frame
+from reportlab.platypus import NextPageTemplate, Paragraph, PageBreak
+from reportlab.platypus.tableofcontents import TableOfContents
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.rl_config import defaultPageSize
+from reportlab.lib.units import inch
+from reportlab.graphics import renderPDF
+
+from pdfrw import PdfReader
+from pdfrw.buildxobj import pagexobj
+from pdfrw.toreportlab import makerl
+
+PAGE_WIDTH = defaultPageSize[0]
+PAGE_HEIGHT = defaultPageSize[1]
+
+
+class MyTemplate(PageTemplate):
+    """Page template that paints page 1 of an existing PDF behind the
+    generated content.  This is the kernel of the example; the same
+    approach could be used for a watermark or similar."""
+
+    def __init__(self, pdf_template_filename, name=None):
+        """Set up a single content frame and load the background page."""
+        content_frame = Frame(
+            0.85 * inch,
+            0.5 * inch,
+            PAGE_WIDTH - 1.15 * inch,
+            PAGE_HEIGHT - (1.5 * inch)
+            )
+        PageTemplate.__init__(self, name, [content_frame])
+        # use first page as template
+        first_page = PdfReader(pdf_template_filename).pages[0]
+        self.page_template = pagexobj(first_page)
+        # Scale it to fill the complete page
+        bbox = self.page_template.BBox
+        self.page_xscale = PAGE_WIDTH/bbox[2]
+        self.page_yscale = PAGE_HEIGHT/bbox[3]
+
+    def beforeDrawPage(self, canvas, doc):
+        """Draw the scaled background before any other page content."""
+        canvas.saveState()
+        background = makerl(canvas, self.page_template)
+        canvas.scale(self.page_xscale, self.page_yscale)
+        canvas.doForm(background)
+        canvas.restoreState()
+
+
+class MyDocTemplate(BaseDocTemplate):
+    """Document template that feeds Heading1 paragraphs to the TOC."""
+
+    def afterFlowable(self, flowable):
+        """Register a TOC entry for each 'Heading1' paragraph drawn."""
+        if flowable.__class__.__name__ != 'Paragraph':
+            return
+        style = flowable.style.name
+        text = flowable.getPlainText()
+        # The 'toc' sequence is advanced for every paragraph, whether
+        # or not it turns out to be a heading.
+        key = '%s' % self.seq.nextf('toc')
+        if style != 'Heading1':
+            return
+        self.canv.bookmarkPage(key)
+        self.notify('TOCEntry', [1, text, self.page, key])
+
+
+def create_toc():
+    """Return the flowables for the table of contents: the TOC itself
+    with two level styles, followed by a page break."""
+    toc = TableOfContents()
+    toc.dotsMinLevel = 0
+    toc.levelStyles = [
+        ParagraphStyle(name='Heading1', fontSize=16, leading=16),
+        ParagraphStyle(name='Heading2', fontSize=14, leading=14),
+    ]
+    return [toc, PageBreak()]
+
+
+def create_pdf(filename, pdf_template_filename):
+    """Create the pdf, with all the contents.
+
+    ``filename`` is the output file to write; page 1 of
+    ``pdf_template_filename`` is used as the background of every page.
+    """
+    # The with-block closes the output file even if building fails.
+    with open(filename, "wb") as pdf_report:
+        document = MyDocTemplate(pdf_report)
+        templates = [MyTemplate(pdf_template_filename, name='background')]
+        document.addPageTemplates(templates)
+
+        styles = getSampleStyleSheet()
+        elements = [NextPageTemplate('background')]
+        elements.extend(create_toc())
+
+        # Dummy content (hello world x 200)
+        for i in range(200):
+            elements.append(
+                Paragraph("Hello World" + str(i), styles['Heading1']))
+
+        # multiBuild rather than a single build pass, presumably so the
+        # table of contents can pick up the final page numbers.
+        document.multiBuild(elements)
+
+
+if __name__ == '__main__':
+    template, = sys.argv[1:]
+    output = 'platypus_pdf_template.' + os.path.basename(template)
+    create_pdf(output, template)
--- a/examples/rl1/subset.py
+++ b/examples/rl1/subset.py
@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+'''
+usage:   subset.py my.pdf firstpage lastpage
+
+Creates subset.my.pdf
+
+
+Uses Form XObjects and reportlab to create output file.
+
+Demonstrates use of pdfrw with reportlab.
+
+'''
+
+import sys
+import os
+
+from reportlab.pdfgen.canvas import Canvas
+
+from pdfrw import PdfReader
+from pdfrw.buildxobj import pagexobj
+from pdfrw.toreportlab import makerl
+
+
+def go(inpfn, firstpage, lastpage):
+    '''Copy pages ``firstpage``..``lastpage`` (1-based, inclusive) of
+    ``inpfn`` into "subset.<basename>" using reportlab.
+
+    Both page numbers may be given as strings; they are converted
+    with int().
+    '''
+    first, last = int(firstpage), int(lastpage)
+    outfn = 'subset.' + os.path.basename(inpfn)
+
+    selected = PdfReader(inpfn).pages[first - 1:last]
+    xobjs = [pagexobj(page) for page in selected]
+    canvas = Canvas(outfn)
+
+    for xobj in xobjs:
+        # Each output page takes its size from the XObject's BBox.
+        page_size = (xobj.BBox[2], xobj.BBox[3])
+        canvas.setPageSize(page_size)
+        canvas.doForm(makerl(canvas, xobj))
+        canvas.showPage()
+
+    canvas.save()
+
+if __name__ == '__main__':
+    inpfn, firstpage, lastpage = sys.argv[1:]
+    go(inpfn, firstpage, lastpage)
--- a/examples/rl2/README.txt
+++ b/examples/rl2/README.txt
@ -0,0 +1,5 @@
+The copy.py demo in this directory parses the graphics stream from the PDF and actually plays it back through reportlab.
+
+Doesn't yet handle fonts or unicode very well.
+
+For a more practical demo, look at the Form XObjects approach in the examples/rl1 directory.
--- a/examples/rl2/copy.py
+++ b/examples/rl2/copy.py
@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+'''
+usage:   copy.py my.pdf
+
+Creates copy.my.pdf
+
+Uses somewhat-functional parser.  For better results
+for most things, see the Form XObject-based method.
+
+'''
+
+import sys
+import os
+
+from reportlab.pdfgen.canvas import Canvas
+
+from decodegraphics import parsepage
+from pdfrw import PdfReader, PdfWriter, PdfArray
+
+# Exactly one command-line argument (the input PDF) is expected.
+inpfn, = sys.argv[1:]
+outfn = 'copy.' + os.path.basename(inpfn)
+# The reader is asked to decompress so the streams can be parsed.
+pages = PdfReader(inpfn, decompress=True).pages
+canvas = Canvas(outfn, pageCompression=0)
+
+for page in pages:
+    box = [float(x) for x in page.MediaBox]
+    # Only pages whose MediaBox starts at (0, 0) are handled.
+    assert box[0] == box[1] == 0, "demo won't work on this PDF"
+    canvas.setPageSize(box[2:])
+    # Replay the page's graphics stream onto the reportlab canvas.
+    parsepage(page, canvas)
+    canvas.showPage()
+canvas.save()
--- a/examples/rl2/decodegraphics.py
+++ b/examples/rl2/decodegraphics.py
@ -0,0 +1,457 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+This file is an example parser that will parse a graphics stream
+into a reportlab canvas.
+
+Needs work on fonts and unicode, but works on a few PDFs.
+
+Better to use Form XObjects for most things (see the example in rl1).
+
+'''
+from inspect import getargspec
+
+from pdfrw import PdfTokens
+from pdfrw.objects import PdfString
+
+#############################################################################
+# Graphics parsing
+
+
+def parse_array(self, token='[', params=None):
+    '''Collect tokens up to the closing ``]`` into one list operand.
+
+    Tokens are pulled directly from ``self.tokens`` (the closing
+    bracket is consumed but not stored) and the resulting list is
+    appended to ``self.params``.  The ``params`` argument is unused.
+    '''
+    mylist = []
+    # Note: the loop variable rebinds the ``token`` argument.
+    for token in self.tokens:
+        if token == ']':
+            break
+        mylist.append(token)
+    self.params.append(mylist)
+
+
+# NOTE(review): in the handlers below the default value of ``token``
+# names the operator handled, and the default ``params`` string looks
+# like a per-operand type code consumed by the dispatcher (not visible
+# here) -- confirm against the dispatch code.
+def parse_savestate(self, token='q', params=''):
+    '''Handle ``q``: save the canvas graphics state.'''
+    self.canv.saveState()
+
+
+def parse_restorestate(self, token='Q', params=''):
+    '''Handle ``Q``: restore the canvas graphics state.'''
+    self.canv.restoreState()
+
+
+def parse_transform(self, token='cm', params='ffffff'):
+    '''Handle ``cm``: pass the six operands to canvas.transform().'''
+    self.canv.transform(*params)
+
+
+def parse_linewidth(self, token='w', params='f'):
+    '''Handle ``w``: set the canvas line width.'''
+    self.canv.setLineWidth(*params)
+
+
+def parse_linecap(self, token='J', params='i'):
+    '''Handle ``J``: set the canvas line cap style.'''
+    self.canv.setLineCap(*params)
+
+
+def parse_linejoin(self, token='j', params='i'):
+    '''Handle ``j``: set the canvas line join style.'''
+    self.canv.setLineJoin(*params)
+
+
+def parse_miterlimit(self, token='M', params='f'):
+    '''Handle ``M``: set the canvas miter limit.'''
+    self.canv.setMiterLimit(*params)
+
+
+def parse_dash(self, token='d', params='as'):  # Array, string
+    '''Handle ``d``: pass both operands to canvas.setDash().'''
+    self.canv.setDash(*params)
+
+
+def parse_intent(self, token='ri', params='n'):
+    '''Handle ``ri``: currently ignored.'''
+    # TODO: add logging
+    pass
+
+
+def parse_flatness(self, token='i', params='i'):
+    '''Handle ``i``: currently ignored.'''
+    # TODO: add logging
+    pass
+
+
+def parse_gstate(self, token='gs', params='n'):
+    '''Handle ``gs``: currently ignored.'''
+    # TODO: add logging
+    # Could parse stuff we care about from here later
+    pass
+
+
+def parse_move(self, token='m', params='ff'):
+    '''Handle ``m``: move to a point, starting a new path if needed.
+
+    The operands are remembered in ``self.current_point``.
+    '''
+    if self.gpath is None:
+        self.gpath = self.canv.beginPath()
+    self.gpath.moveTo(*params)
+    self.current_point = params
+
+
+def parse_line(self, token='l', params='ff'):
+    '''Handle ``l``: add a line to the path and record its end point.
+
+    Unlike parse_move, this requires ``self.gpath`` to exist already.
+    '''
+    self.gpath.lineTo(*params)
+    self.current_point = params
+
+
+def parse_curve(self, token='c', params='ffffff'):
+    '''Handle ``c``: add a curve from six operands.
+
+    The last two operands become ``self.current_point``.
+    '''
+    self.gpath.curveTo(*params)
+    self.current_point = params[-2:]
+
+
+def parse_curve1(self, token='v', params='ffff'):
+    '''Handle ``v``: like ``c``, with the current point prepended to
+    the four operands as the first coordinate pair.'''
+    parse_curve(self, token, tuple(self.current_point) + tuple(params))
+
+
+def parse_curve2(self, token='y', params='ffff'):
+    '''Handle ``y``: like ``c``, with the final coordinate pair of the
+    four operands repeated at the end.'''
+    parse_curve(self, token, tuple(params) + tuple(params[-2:]))
+
+
+def parse_close(self, token='h', params=''):
+    '''Handle ``h``: close the current path.'''
+    self.gpath.close()
+
+
+def parse_rect(self, token='re', params='ffff'):
+    '''Handle ``re``: append a rectangle to the current path.
+
+    ``params`` holds (x, y, width, height).  A new path is started if
+    none is open, and ``self.current_point`` is updated.
+    '''
+    if self.gpath is None:
+        self.gpath = self.canv.beginPath()
+    self.gpath.rect(*params)
+    # "re" is defined as a move to (x, y), three lines and a close, so
+    # the current point ends up back at (x, y) -- not at the
+    # (width, height) pair that params[-2:] would store.
+    self.current_point = params[:2]
+
+
+# The painting handlers below all end in finish_path(); its three
+# arguments are (stroke, fill, fillmode).
+def parse_stroke(self, token='S', params=''):
+    '''Handle ``S``: stroke the current path.'''
+    finish_path(self, 1, 0, 0)
+
+
+def parse_close_stroke(self, token='s', params=''):
+    '''Handle ``s``: close the current path, then stroke it.'''
+    self.gpath.close()
+    finish_path(self, 1, 0, 0)
+
+
+def parse_fill(self, token='f', params=''):
+    '''Handle ``f``: fill the current path with fill mode 1.'''
+    finish_path(self, 0, 1, 1)
+
+
+def parse_fill_compat(self, token='F', params=''):
+    '''Handle ``F``: same as ``f``.'''
+    finish_path(self, 0, 1, 1)
+
+
+def parse_fill_even_odd(self, token='f*', params=''):
+    '''Handle ``f*``: fill the current path with fill mode 0.'''
+    finish_path(self, 0, 1, 0)
+
+
+def parse_fill_stroke_even_odd(self, token='B*', params=''):
+    '''Handle ``B*``: fill (mode 0) and stroke the current path.'''
+    finish_path(self, 1, 1, 0)
+
+
+def parse_fill_stroke(self, token='B', params=''):
+    '''Handle ``B``: fill (mode 1) and stroke the current path.'''
+    finish_path(self, 1, 1, 1)
+
+
+def parse_close_fill_stroke_even_odd(self, token='b*', params=''):
+    '''Handle ``b*``: close, then fill (mode 0) and stroke the path.'''
+    self.gpath.close()
+    finish_path(self, 1, 1, 0)
+
+
+def parse_close_fill_stroke(self, token='b', params=''):
+    '''Handle ``b``: close, then fill (mode 1) and stroke the path.'''
+    self.gpath.close()
+    finish_path(self, 1, 1, 1)
+
+
+def parse_nop(self, token='n', params=''):
+    '''Handle ``n``: end the path with neither stroke nor fill.'''
+    finish_path(self, 0, 0, 0)
+
+
+def finish_path(self, stroke, fill, fillmode):
+    '''Draw the pending path (if any) and reset ``self.gpath`` to None.
+
+    ``stroke`` and ``fill`` are passed through to canvas.drawPath();
+    ``fillmode`` is installed as the canvas' ``_fillMode`` only for the
+    duration of that call.  Does nothing when no path is open.
+    '''
+    if self.gpath is not None:
+        canv = self.canv
+        # Swap in the requested fill mode, remembering the old one.
+        canv._fillMode, oldmode = fillmode, canv._fillMode
+        canv.drawPath(self.gpath, stroke, fill)
+        canv._fillMode = oldmode
+        self.gpath = None
+
+
+def parse_clip_path(self, token='W', params=''):
+    '''Handle ``W``: currently ignored.'''
+    # TODO: add logging
+    pass
+
+
+def parse_clip_path_even_odd(self, token='W*', params=''):
+    '''Handle ``W*``: currently ignored.'''
+    # TODO: add logging
+    pass
+
+
+def parse_stroke_gray(self, token='G', params='f'):
+    '''Handle ``G``: set the stroke gray level on the canvas.'''
+    self.canv.setStrokeGray(*params)
+
+
+def parse_fill_gray(self, token='g', params='f'):
+    '''Handle ``g``: set the fill gray level on the canvas.'''
+    self.canv.setFillGray(*params)
+
+
+def parse_stroke_rgb(self, token='RG', params='fff'):
+    '''Handle ``RG``: set the stroke colour from three RGB operands.'''
+    self.canv.setStrokeColorRGB(*params)
+
+
+def parse_fill_rgb(self, token='rg', params='fff'):
+    '''Handle ``rg``: set the fill colour from three RGB operands.'''
+    self.canv.setFillColorRGB(*params)
+
+
+def parse_stroke_cmyk(self, token='K', params='ffff'):
+    '''Handle ``K``: set the stroke colour from four CMYK operands.'''
+    self.canv.setStrokeColorCMYK(*params)
+
+
+def parse_fill_cmyk(self, token='k', params='ffff'):
+    '''Handle ``k``: set the fill colour from four CMYK operands.'''
+    self.canv.setFillColorCMYK(*params)
+
+#############################################################################
+# Text parsing
+
+
+def parse_begin_text(self, token='BT', params=''):
+    '''Handle ``BT``: start a new text object on the canvas.
+
+    Asserts that no text object is already open.
+    '''
+    assert self.tpath is None
+    self.tpath = self.canv.beginText()
+
+
+def parse_text_transform(self, token='Tm', params='ffffff'):
+    '''Handle ``Tm``: set the text matrix of the open text object.
+
+    If the text object exposes a ``_code`` list whose last entry is an
+    identity ``Tm``, that entry is removed first so it is not emitted
+    in addition to the new matrix.
+    '''
+    path = self.tpath
+
+    # Stoopid optimization to remove nop
+    # (relies on the text object's private _code list; skipped when
+    # that attribute does not exist)
+    try:
+        code = path._code
+    except AttributeError:
+        pass
+    else:
+        if code[-1] == '1 0 0 1 0 0 Tm':
+            code.pop()
+
+    path.setTextTransform(*params)
+
+
+def parse_setfont(self, token='Tf', params='nf'):
+    fontinfo = self.fontdict[params[0]]
+    self.tpath._setFont(fontinfo.name, params[1])
+    self.curfont = fontinfo
+
+
+def parse_text_out(self, token='Tj', params='t'):
+    ''' Operator 'Tj': decode the string operand with the current
+        font's remap/twobyte settings and emit it.
+    '''
+    font = self.curfont
+    decoded = params[0].decode(font.remap, font.twobyte)
+    self.tpath.textOut(decoded)
+
+def parse_lf_text_out(self, token="'", params='t'):
+    ''' Operator "'": move to the next line, then decode and emit
+        the string operand using the current font.
+    '''
+    text = self.tpath
+    text.textLine()
+    font = self.curfont
+    decoded = params[0].decode(font.remap, font.twobyte)
+    text.textOut(decoded)
+
+
+def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
+    ''' Operator '"': set word spacing (first operand) and character
+        spacing (second operand), move to the next line, then decode
+        and emit the string (third operand).
+    '''
+    text = self.tpath
+    text.setWordSpace(params[0])
+    text.setCharSpace(params[1])
+    text.textLine()
+    font = self.curfont
+    decoded = params[2].decode(font.remap, font.twobyte)
+    text.textOut(decoded)
+
+
+def parse_TJ(self, token='TJ', params='a'):
+    ''' Operator 'TJ': emit the strings of the operand array as one
+        run of text.
+
+        Array elements that are PdfString instances are decoded with
+        the current font; every other element must be a number (a
+        position adjustment), which is validated and then ignored.
+    '''
+    remap = self.curfont.remap
+    twobyte = self.curfont.twobyte
+    result = []
+    for x in params[0]:
+        if isinstance(x, PdfString):
+            result.append(x.decode(remap, twobyte))
+        else:
+            # TODO: Adjust spacing between characters here
+            # Adjustments may be reals (e.g. -12.5), so validate with
+            # float(); int() would reject them with a ValueError.
+            float(x)
+    text = ''.join(result)
+    self.tpath.textOut(text)
+
+
+def parse_end_text(self, token='ET', params=''):
+    ''' Operator 'ET': draw the open text object on the canvas and
+        close it.  Asserts that a text object is open.
+    '''
+    text = self.tpath
+    assert text is not None
+    self.canv.drawText(text)
+    self.tpath = None
+
+
+def parse_move_cursor(self, token='Td', params='ff'):
+    ''' Operator 'Td': move the text cursor.  The second operand is
+        negated before being handed to the text object's moveCursor.
+    '''
+    dx = params[0]
+    dy = params[1]
+    self.tpath.moveCursor(dx, -dy)
+
+
+def parse_set_leading(self, token='TL', params='f'):
+    ''' Operator 'TL': forward the operands to the text object's
+        setLeading.
+    '''
+    text = self.tpath
+    text.setLeading(*params)
+
+
+def parse_text_line(self, token='T*', params=''):
+    ''' Operator 'T*': move the text object to the next line.
+    '''
+    text = self.tpath
+    text.textLine()
+
+
+def parse_set_char_space(self, token='Tc', params='f'):
+    ''' Operator 'Tc': forward the operands to the text object's
+        setCharSpace.
+    '''
+    text = self.tpath
+    text.setCharSpace(*params)
+
+
+def parse_set_word_space(self, token='Tw', params='f'):
+    ''' Operator 'Tw': forward the operands to the text object's
+        setWordSpace.
+    '''
+    text = self.tpath
+    text.setWordSpace(*params)
+
+
+def parse_set_hscale(self, token='Tz', params='f'):
+    ''' Operator 'Tz': pass the operand minus 100 to the text
+        object's setHorizScale.
+    '''
+    scale = params[0]
+    self.tpath.setHorizScale(scale - 100)
+
+
+def parse_set_rise(self, token='Ts', params='f'):
+    ''' Operator 'Ts': forward the operands to the text object's
+        setRise.
+    '''
+    text = self.tpath
+    text.setRise(*params)
+
+
+def parse_xobject(self, token='Do', params='n'):
+    ''' Operator 'Do': not implemented -- accepted and ignored.
+    '''
+    # TODO: Need to do this
+    return None
+
+
+class FontInfo(object):
+    ''' Minimal per-font information used while decoding text.
+
+        Attributes set by the constructor:
+          name    -- BaseFont without its leading slash, mapped
+                     through the lookup table
+          remap   -- callable turning a character code into text
+          twobyte -- True when the ToUnicode codes are two bytes wide
+
+        Pretty basic -- needs a lot of work to work right for all fonts
+    '''
+    lookup = {
+                # WRONG -- have to learn about font stuff...
+                'BitstreamVeraSans': 'Helvetica',
+             }
+
+    def __init__(self, source):
+        basename = source.BaseFont[1:]
+        self.name = self.lookup.get(basename, basename)
+        # Defaults, used when the font has no ToUnicode map
+        self.remap = chr
+        self.twobyte = False
+        tounicode = source.ToUnicode
+        if not tounicode:
+            return
+        # Only the first beginbfchar ... endbfchar section is read
+        section = tounicode.stream.split('beginbfchar')[1]
+        section = section.split('endbfchar')[0]
+        tokens = list(PdfTokens(section))
+        assert not len(tokens) & 1
+        codes = []
+        for tok in tokens:
+            assert tok[0] == '<' and tok[-1] == '>' and len(tok) in (4, 6), tok
+            codes.append(int(tok[1:-1], 16))
+        # Tokens alternate: source code, destination code point
+        mapping = dict((src, chr(dst)) for (src, dst) in
+                       zip(codes[::2], codes[1::2]))
+        self.remap = mapping.get
+        self.twobyte = len(tokens[0]) > 4
+
+#############################################################################
+# Control structures
+
+
+def findparsefuncs():
+    ''' Build the operator dispatch table from this module's globals.
+
+        Every global whose name starts with 'parse_' must have the
+        signature (self, token=<operator>, params=<spec>).  The
+        token default is the operator string; the params default is
+        either None or a string with one type code per operand
+        (f=float, i=int, n=name, a=array, s=str, t=PdfString).
+
+        Returns a dict mapping operator -> (function, converters),
+        where converters is a tuple of callables, or None.
+    '''
+
+    def checkname(n):
+        assert n.startswith('/')
+        return n
+
+    def checkarray(a):
+        assert isinstance(a, list), a
+        return a
+
+    def checktext(t):
+        assert isinstance(t, PdfString)
+        return t
+
+    fixparam = dict(f=float, i=int, n=checkname, a=checkarray,
+                    s=str, t=checktext)
+    fixcache = {}
+
+    def fixlist(params):
+        # Identical parameter specs share one converter tuple
+        try:
+            result = fixcache[params]
+        except KeyError:
+            result = tuple(fixparam[x] for x in params)
+            fixcache[params] = result
+        return result
+
+    dispatch = {}
+    expected_args = 'self token params'.split()
+    for key, func in globals().items():
+        if key.startswith('parse_'):
+            args, varargs, keywords, defaults = getargspec(func)
+            assert (args == expected_args and varargs is None and
+                    keywords is None and len(defaults) == 2), (
+                    key, args, varargs, keywords, defaults)
+            token, params = defaults
+            if params is not None:
+                params = fixlist(params)
+            value = func, params
+            # Register outside the assert: assert statements are
+            # stripped under "python -O", which would otherwise leave
+            # the dispatch table empty.
+            registered = dispatch.setdefault(token, value)
+            # Two handlers claiming the same operator is a coding error
+            assert registered is value, repr(token)
+    return dispatch
+
+
+class _ParseClass(object):
+    ''' Content-stream interpreter.  Tokens that are not operators
+        are collected as operands; each operator is looked up in the
+        dispatch table and called with its converted operands.
+    '''
+    dispatch = findparsefuncs()
+
+    @classmethod
+    def parsepage(cls, page, canvas=None):
+        ''' Parse page.Contents and replay it onto canvas.
+
+            Raises SystemExit if the content stream still has a
+            Filter.  An operand that fails conversion raises the
+            converter's exception.
+        '''
+        self = cls()
+        contents = page.Contents
+        if contents.Filter is not None:
+            raise SystemExit('Cannot parse graphics -- page encoded with %s'
+                             % contents.Filter)
+        dispatch = cls.dispatch.get
+        self.tokens = iter(PdfTokens(contents.stream))
+        self.params = params = []
+        self.canv = canvas
+        self.gpath = None
+        self.tpath = None
+        self.fontdict = dict((x, FontInfo(y)) for
+                             (x, y) in page.Resources.Font.items())
+
+        for token in self.tokens:
+            info = dispatch(token)
+            if info is None:
+                # Not an operator: keep it as a pending operand
+                params.append(token)
+                continue
+            func, paraminfo = info
+            if paraminfo is None:
+                func(self, token, ())
+                continue
+            delta = len(params) - len(paraminfo)
+            if delta:
+                if delta < 0:
+                    print ('Operator %s expected %s parameters, got %s' %
+                           (token, len(paraminfo), params))
+                    params[:] = []
+                    continue
+                else:
+                    print ("Unparsed parameters/commands: %s" % params[:delta])
+                del params[:delta]
+            # Convert every operand.  A failing converter raises here
+            # on both Python 2 and 3; the old retry loop walked an
+            # already-consumed zip iterator on Python 3 and silently
+            # passed the raw operands on to the handler.
+            params[:] = [convert(value)
+                         for (convert, value) in zip(paraminfo, params)]
+            func(self, token, params)
+            params[:] = []
+
+
+def debugparser(undisturbed=set('parse_array'.split())):
+    ''' Return a parsepage function that prints each operator call
+        instead of executing it.  Handlers whose function name is in
+        undisturbed are kept as they are.
+    '''
+    def getvalue(oldval):
+        func, paraminfo = oldval[0], oldval[1]
+        name = func.__name__
+        if name in undisturbed:
+            return func, paraminfo
+
+        def myfunc(self, token, params):
+            print ('%s called %s(%s)' % (token, name,
+                   ', '.join(str(x) for x in params)))
+        return myfunc, paraminfo
+
+    table = dict((token, getvalue(entry))
+                 for (token, entry) in _ParseClass.dispatch.items())
+
+    class _DebugParse(_ParseClass):
+        dispatch = table
+
+    return _DebugParse.parsepage
+
+# Public entry point of this module
+parsepage = _ParseClass.parsepage
+
+if __name__ == '__main__':
+    # Command-line use: print the operator calls for every page of
+    # the single PDF named on the command line.
+    import sys
+    from pdfrw import PdfReader
+    trace_page = debugparser()
+    (source_name,) = sys.argv[1:]
+    reader = PdfReader(source_name, decompress=True)
+    for pageno, one_page in enumerate(reader.pages):
+        print ('\nPage %s ------------------------------------' % pageno)
+        trace_page(one_page)
--- a/examples/rotate.py
+++ b/examples/rotate.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+'''
+usage:   rotate.py my.pdf rotation [page[range] ...]
+         eg. rotate.py my.pdf 270 1-3 5 7-9
+
+        Rotation must be multiple of 90 degrees, clockwise.
+
+Creates rotate.my.pdf with selected pages rotated.  Rotates all by default.
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter
+
+# Positional arguments: input file, rotation, optional page ranges
+inpfn = sys.argv[1]
+rotate = int(sys.argv[2])
+assert rotate % 90 == 0
+
+# Each range is "N" or "N-M" (1-based, inclusive)
+ranges = [[int(y) for y in x.split('-')] for x in sys.argv[3:]]
+outfn = 'rotate.%s' % os.path.basename(inpfn)
+trailer = PdfReader(inpfn)
+pages = trailer.pages
+
+# No ranges given: rotate every page
+if not ranges:
+    ranges = [[1, len(pages)]]
+
+for onerange in ranges:
+    # A single page number "N" is treated as the range N-N
+    first, last = (onerange + onerange[-1:])[:2]
+    for pagenum in range(first - 1, last):
+        page = pages[pagenum]
+        current = int(page.inheritable.Rotate or 0)
+        page.Rotate = (current + rotate) % 360
+
+outdata = PdfWriter(outfn)
+outdata.trailer = trailer
+outdata.write()
--- a/examples/subset.py
+++ b/examples/subset.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+'''
+usage:   subset.py my.pdf page[range] [page[range]] ...
+         eg. subset.py my.pdf 1-3 5 7-9
+
+Creates subset.my.pdf
+
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter
+
+# Positional arguments: input file, then one or more page ranges
+inpfn = sys.argv[1]
+ranges = sys.argv[2:]
+assert ranges, "Expected at least one range"
+
+# Each range is "N" or "N-M" (1-based, inclusive); parsed lazily
+ranges = ([int(y) for y in x.split('-')] for x in ranges)
+outfn = 'subset.%s' % os.path.basename(inpfn)
+pages = PdfReader(inpfn).pages
+outdata = PdfWriter(outfn)
+
+for onerange in ranges:
+    # A single page number "N" is treated as the range N-N
+    first, last = (onerange + onerange[-1:])[:2]
+    for pagenum in range(first, last + 1):
+        outdata.addpage(pages[pagenum - 1])
+outdata.write()
--- a/examples/subset_booklets.py
+++ b/examples/subset_booklets.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+'''
+usage: subset_booklets.py my.pdf
+
+Creates booklet.my.pdf
+
+Pages organized in a form suitable for booklet printing, e.g.
+to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
+Instead of a large booklet, the pdf is divided into several mini
+booklets. The reason is: professional printing works this way:
+    - Print all of several mini booklets (subsets of booklet);
+    - Sew each mini booklet individually;
+    - glue them all together;
+    - Insert the cover.
+
+    Take a look at http://www.wikihow.com/Bind-a-Book
+'''
+
+import sys
+import os
+import time
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+# Number of source pages that go into each mini booklet
+BOOKLET_SIZE = 20
+# Wall-clock start time, used for the elapsed-time report at the end
+START = time.time()
+
+def fixpage(*pages):
+    ''' Merge the given pages (skipping None) into one output page,
+        shifting the last one right by the width of the first.
+    '''
+    present = (page for page in pages if page is not None)
+    merged = PageMerge() + present
+    last = merged[-1]
+    first = merged[0]
+    last.x += first.w
+    return merged.render()
+
+INPFN, = sys.argv[1:]
+OUTFN = 'booklet.' + os.path.basename(INPFN)
+ALL_IPAGES = PdfReader(INPFN).pages
+# print() with one parenthesized string behaves the same on Python 2
+# and Python 3; the bare print statement is a SyntaxError on Python 3.
+print('The pdf file '+str(INPFN)+' has '+str(len(ALL_IPAGES))+' pages.')
+
+#Make sure we have an even number
+if len(ALL_IPAGES) & 1:
+    ALL_IPAGES.append(None)
+    print('Inserting one more blank page to make pages number even.')
+NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)
+
+print('Making '+str(NUM_OF_ITER)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.')
+opages = []
+for iteration in range(0, NUM_OF_ITER):
+    ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
+    # Pair pages from the outside in: (last, first), then (second,
+    # second-to-last), and so on.
+    while len(ipages) > 2:
+        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+        opages.append(fixpage(ipages.pop(0), ipages.pop()))
+
+# Making one more subbooklet with the left pages
+ipages = ALL_IPAGES[len(ALL_IPAGES)-ITERS_LEFT:len(ALL_IPAGES)]
+while len(ipages) > 2:
+    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+    opages.append(fixpage(ipages.pop(0), ipages.pop()))
+if len(ipages) >= 1:
+    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
+
+PdfWriter(OUTFN).addpages(opages).write()
+print('It took '+ str(round(time.time()-START, 2))+' seconds to make the pdf subbooklets changes.')
--- a/examples/unspread.py
+++ b/examples/unspread.py
@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+'''
+usage:   unspread.py my.pdf
+
+Creates unspread.my.pdf
+
+Chops each page in half, e.g. if a source were
+created in booklet form, you could extract individual
+pages.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+
+def splitpage(src):
+    ''' Yield two rendered pages: the left half of src, then the
+        right half.
+    '''
+    # viewrect is (x, y, width, height) in fractions of the page
+    halves = ((0, 0, 0.5, 1), (0.5, 0, 0.5, 1))
+    for viewrect in halves:
+        yield PageMerge().add(src, viewrect=viewrect).render()
+
+
+# Exactly one argument is expected: the input file name
+(inpfn,) = sys.argv[1:]
+outfn = 'unspread.%s' % os.path.basename(inpfn)
+writer = PdfWriter(outfn)
+for page in PdfReader(inpfn).pages:
+    # Every source page contributes its two halves
+    writer.addpages(splitpage(page))
+writer.write()
--- a/examples/watermark.py
+++ b/examples/watermark.py
@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+'''
+Simple example of watermarking using form xobjects (pdfrw).
+
+usage:   watermark.py [-u] my.pdf single_page.pdf
+
+Creates watermark.my.pdf, with every page overlaid with
+first page from single_page.pdf.  If -u is selected, watermark
+will be placed underneath page (painted first).
+
+NOTE 1: This program assumes that all pages (including the watermark
+        page) are the same size.  For other possibilities, see
+        the fancy_watermark.py example.
+
+NOTE 2: At one point, this example was extremely complicated, with
+        multiple options.  That only led to errors in implementation,
+        so it has been re-simplified in order to show basic principles
+        of the library operation and to match the other examples better.
+'''
+
+import sys
+import os
+
+from pdfrw import PdfReader, PdfWriter, PageMerge
+
+argv = sys.argv[1:]
+# Optional -u flag: paint the watermark underneath the page content
+underneath = '-u' in argv
+if underneath:
+    argv.remove('-u')
+inpfn, wmarkfn = argv
+outfn = 'watermark.%s' % os.path.basename(inpfn)
+# The watermark is the first page of the second file
+wmark_page = PdfReader(wmarkfn).pages[0]
+wmark = PageMerge().add(wmark_page)[0]
+trailer = PdfReader(inpfn)
+for page in trailer.pages:
+    PageMerge(page).add(wmark, prepend=underneath).render()
+PdfWriter(outfn, trailer=trailer).write()
--- a/pdfrw/__init__.py
+++ b/pdfrw/__init__.py
@ -0,0 +1,23 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+from .pdfwriter import PdfWriter
+from .pdfreader import PdfReader
+from .objects import (PdfObject, PdfName, PdfArray,
+                      PdfDict, IndirectPdfDict, PdfString)
+from .tokens import PdfTokens
+from .errors import PdfParseError
+from .pagemerge import PageMerge
+
+__version__ = '0.4'
+
+# Add a tiny bit of compatibility to pyPdf
+
+PdfFileReader = PdfReader
+PdfFileWriter = PdfWriter
+
+# Names exported by "from pdfrw import *"
+__all__ = [
+    'PdfWriter', 'PdfReader', 'PdfObject', 'PdfName', 'PdfArray',
+    'PdfTokens', 'PdfParseError', 'PdfDict', 'IndirectPdfDict',
+    'PdfString', 'PageMerge',
+]
+
--- a/pdfrw/buildxobj.py
+++ b/pdfrw/buildxobj.py
@ -0,0 +1,363 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+
+This module contains code to build PDF "Form XObjects".
+
+A Form XObject allows a fragment from one PDF file to be cleanly
+included in another PDF file.
+
+Reference for syntax: "Parameters for opening PDF files" from SDK 8.1
+
+        http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf
+
+        supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>'
+
+        Also supported by this, but not by Adobe:
+            'rotate=xxx'  where xxx in [0, 90, 180, 270]
+
+        Units are in points
+
+
+Reference for content:   Adobe PDF reference, sixth edition, version 1.7
+
+        http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
+
+        Form xobjects discussed chapter 4.9, page 355
+'''
+
+from .objects import PdfDict, PdfArray, PdfName
+from .pdfreader import PdfReader
+from .errors import log, PdfNotImplementedError
+from .py23_diffs import iteritems
+from .uncompress import uncompress
+from .compress import compress
+
+
+class ViewInfo(object):
+    ''' Instantiate ViewInfo with a uri, and it will parse out
+        the filename, page, and viewrect into object attributes.
+
+        The uri has the form  name#opt=value&opt=value...  where the
+        recognized options are page, rotate and viewrect.  Keyword
+        arguments override attributes and must name existing ones.
+
+        Note 1:
+            Viewrects follow the adobe definition.  (See reference
+            above). They are arrays of 4 numbers:
+
+            - Distance from left of document in points
+            - Distance from top (NOT bottom) of document in points
+            - Width of rectangle in points
+            - Height of rectangle in points
+
+        Note 2:
+            For simplicity, Viewrects can also be specified
+            in fractions of the document.  If every number in
+            the viewrect is between 0 and 1 inclusive, then
+            viewrect elements 0 and 2 are multiplied by the
+            mediabox width before use, and viewrect elements
+            1 and 3 are multiplied by the mediabox height before
+            use.
+
+        Note 3:
+            By default, an XObject based on the view will be
+            cacheable.  It should not be cacheable if the XObject
+            will be subsequently modified.
+    '''
+    doc = None
+    docname = None
+    page = None
+    viewrect = None
+    rotate = None
+    cacheable = True
+
+    def __init__(self, pageinfo='', **kw):
+        # Split "name#a=1&b=2" into ['name', 'a=1', 'b=2']
+        parts = pageinfo.split('#', 1)
+        if len(parts) == 2:
+            parts[1:] = parts[1].replace('&', '#').split('#')
+        # The leading part is a document name unless it already
+        # looks like one of the options.
+        if not parts[0].startswith(('page=', 'viewrect=')):
+            self.docname = parts.pop(0)
+        for option in parts:
+            key, value = option.split('=')
+            key = key.strip()
+            numbers = value.replace(',', ' ').split()
+            if key in ('page', 'rotate'):
+                assert len(numbers) == 1
+                setattr(self, key, int(numbers[0]))
+            elif key == 'viewrect':
+                assert len(numbers) == 4
+                setattr(self, key, [float(x) for x in numbers])
+            else:
+                log.error('Unknown option: %s', key)
+        for key, value in iteritems(kw):
+            assert hasattr(self, key), key
+            setattr(self, key, value)
+
+
+def get_rotation(rotate):
+    ''' Return clockwise rotation code:
+          0 = unrotated
+          1 = 90 degrees
+          2 = 180 degrees
+          3 = 270 degrees
+
+        Anything that cannot be converted to an integer, or is not
+        a multiple of 90, yields 0.  The result is rotate // 90 and
+        is not reduced modulo 4.
+    '''
+    try:
+        degrees = int(rotate)
+    except (ValueError, TypeError):
+        return 0
+    quarter_turns, remainder = divmod(degrees, 90)
+    if remainder:
+        return 0
+    return quarter_turns
+
+
+def rotate_point(point, rotation):
+    ''' Rotate an (x,y) coordinate clockwise by a
+        rotation code specifying a multiple of 90 degrees.
+
+        Only the two low bits of rotation are examined.  When
+        neither is set, point is returned unchanged.
+    '''
+    quarter_turn = rotation & 1
+    half_turn = rotation & 2
+    if quarter_turn:
+        x, y = point[0], point[1]
+        point = (y, -x)
+    if half_turn:
+        x, y = point[0], point[1]
+        point = (-x, -y)
+    return point
+
+
+def rotate_rect(rect, rotation):
+    ''' Rotate both corner points of the rectangle, then normalize
+        the result by returning the new lower left followed by the
+        new upper right, as a 4-tuple.
+    '''
+    first = rotate_point(rect[:2], rotation)
+    second = rotate_point(rect[2:], rotation)
+    xs = (first[0], second[0])
+    ys = (first[1], second[1])
+    return (min(xs), min(ys), max(xs), max(ys))
+
+
+def getrects(inheritable, pageinfo, rotation):
+    ''' Given the inheritable attributes of a page and
+        the desired pageinfo rectangle, return the page's
+        media box and the calculated boundary (clip) box.
+
+        Without a viewrect, the clip box is the page's CropBox
+        (or its MediaBox when there is no CropBox).
+    '''
+    mbox = tuple(float(v) for v in inheritable.MediaBox)
+    cbox = tuple(float(v) for v in (inheritable.CropBox or mbox))
+    vrect = pageinfo.viewrect
+    if vrect is None:
+        return mbox, cbox
+
+    # Rotate the crop box to match what the user sees,
+    # figure out the clipping box, then rotate back
+    left, bottom, right, top = rotate_rect(cbox, rotation)
+    x, y, w, h = vrect
+
+    # Support operations in fractions of a page
+    if 0 <= min(vrect) < max(vrect) <= 1:
+        width = right - left
+        height = top - bottom
+        x, w = x * width, w * width
+        y, h = y * height, h * height
+
+    # The viewrect is measured from the top left corner
+    clip_left = left + x
+    clip_top = top - y
+    clip_right = clip_left + w
+    clip_bottom = clip_top - h
+    clipped = (max(left, clip_left), max(bottom, clip_bottom),
+               min(right, clip_right), min(top, clip_top))
+    return mbox, rotate_rect(clipped, -rotation)
+
+
+def _build_cache(contents, allow_compressed):
+    ''' Build a new dictionary holding the stream,
+        and save it along with private cache info.
+        Assumes validity has been pre-checked if
+        we have a non-None xobj_copy.
+
+        Also, the spec says nothing about nested arrays,
+        so we assume those don't exist until we see one
+        in the wild.
+    '''
+    try:
+        xobj_copy = contents.xobj_copy
+    except AttributeError:
+        # Should have a PdfArray here...
+        array = contents
+        private = contents
+    else:
+        # Should have a PdfDict here -- might or might not have cache copy
+        if xobj_copy is not None:
+            return xobj_copy
+        array = [contents]
+        private = contents.private
+
+    # If we don't allow compressed objects, OR if we have multiple compressed
+    # objects, we try to decompress them, and fail if we cannot do that.
+
+    if not allow_compressed or len(array) > 1:
+        # Union of the dictionary keys used by all the stream dicts
+        keys = set(x[0] for cdict in array for x in iteritems(cdict))
+        # More than one distinct key is taken to mean "compressed".
+        # NOTE(review): presumably an unfiltered stream carries only
+        # /Length -- confirm.
+        was_compressed = len(keys) > 1
+        if was_compressed:
+            # Make copies of the objects before we uncompress them.
+            array = [PdfDict(x) for x in array]
+            if not uncompress(array):
+                raise PdfNotImplementedError(
+                    'Xobjects with these compression parameters not supported: %s' %
+                    keys)
+    
+    # The copy starts from the first stream and gets its own, empty
+    # cache of XObjects; it is remembered on the source object.
+    xobj_copy = PdfDict(array[0])
+    xobj_copy.private.xobj_cachedict = {}
+    private.xobj_copy = xobj_copy
+
+    if len(array) > 1:
+        # Concatenate the streams, separated by newlines; the total
+        # must equal the declared lengths plus the separators.
+        newstream = '\n'.join(x.stream for x in array)
+        newlength = sum(int(x.Length) for x in array) + len(array) - 1
+        assert newlength == len(newstream)
+        xobj_copy.stream = newstream
+        # was_compressed is always bound here: len(array) > 1 means
+        # the branch above was taken.
+        if was_compressed and allow_compressed:
+            compress(xobj_copy)
+
+    return xobj_copy
+
+
+def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
+    ''' Return a cached Form XObject, or create a new one and cache it.
+        Adds private members x, y, w, h
+
+        The cache lives in contents.xobj_cachedict and is keyed on
+        (mbox, bbox, rotation).  With cacheable false, the cache is
+        neither consulted nor updated.
+    '''
+    cachedict = contents.xobj_cachedict
+    cachekey = mbox, bbox, rotation
+    if cacheable:
+        cached = cachedict.get(cachekey)
+        if cached is not None:
+            return cached
+
+    # If we are not getting a full page, or if we are going to
+    # modify the results, first retrieve an underlying Form XObject
+    # that represents the entire page, so that we are not copying
+    # the full page data into the new file multiple times
+    if mbox != bbox or not cacheable:
+        builder = _get_subpage
+    else:
+        builder = _get_fullpage
+    result = PdfDict(
+        builder(contents, resources, mbox),
+        Type=PdfName.XObject,
+        Subtype=PdfName.Form,
+        FormType=1,
+        BBox=PdfArray(bbox),
+    )
+
+    rect = bbox
+    if rotation:
+        # Matrix is the rotated unit vectors with no translation
+        unit_x = rotate_point((1, 0), rotation)
+        unit_y = rotate_point((0, 1), rotation)
+        result.Matrix = PdfArray(unit_x + unit_y + (0, 0))
+        rect = rotate_rect(rect, rotation)
+
+    # Position and size of the (rotated) bounding box
+    private = result.private
+    private.x = rect[0]
+    private.y = rect[1]
+    private.w = rect[2] - rect[0]
+    private.h = rect[3] - rect[1]
+    if cacheable:
+        cachedict[cachekey] = result
+    return result
+
+
+def _get_fullpage(contents, resources, mbox):
+    ''' Full pages are the simple case: copy the contents and attach
+        the resources.  _cache_xobj fills in everything else.
+        mbox is accepted for signature parity with _get_subpage and
+        is not used.
+    '''
+    fullpage = PdfDict(contents, Resources=resources)
+    return fullpage
+
+
+def _get_subpage(contents, resources, mbox):
+    ''' subpages *could* be as easy as full pages, but we
+        choose to complicate life by creating a Form XObject
+        for the page, and then one that references it for
+        the subpage, on the off-chance that we want multiple
+        items from the page.
+    '''
+    # The (cached) XObject for the whole, unrotated page
+    fullpage = _cache_xobj(contents, resources, mbox, mbox, 0)
+    xobjects = PdfDict(FullPage=fullpage)
+    # The subpage stream just paints that XObject
+    return PdfDict(
+        stream='/FullPage Do\n',
+        Resources=PdfDict(XObject=xobjects),
+    )
+
+
+def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
+    ''' Create and return a Form XObject for a view within a page.
+
+        page is the source page; viewinfo is a ViewInfo describing
+        the view (the default selects the entire page).  The page's
+        own Rotate and viewinfo.rotate are added together.
+    '''
+    inheritable = page.inheritable
+    page_rotation = get_rotation(inheritable.Rotate)
+    # The rectangles are computed using the page's own rotation only
+    mbox, bbox = getrects(inheritable, viewinfo, page_rotation)
+    total_rotation = page_rotation + get_rotation(viewinfo.rotate)
+    contents = _build_cache(page.Contents, allow_compressed)
+    return _cache_xobj(contents, inheritable.Resources, mbox, bbox,
+                       total_rotation, viewinfo.cacheable)
+
+
+def docxobj(pageinfo, doc=None, allow_compressed=True):
+    ''' docxobj reads a page out of a document and uses
+        pagexobj to create the Form XObject based on
+        the page.
+
+        pageinfo is a ViewInfo, or a string from which one is
+        built.  The page defaults to the first page.
+
+        This is a convenience function for things like
+        rst2pdf that want to be able to pass in textual
+        filename/location descriptors and don't want to
+        know about using PdfReader.
+
+        Can work standalone, or in conjunction with
+        the CacheXObj class (below).
+
+    '''
+    if not isinstance(pageinfo, ViewInfo):
+        pageinfo = ViewInfo(pageinfo)
+
+    if doc is None:
+        # No explicit document: use the one on pageinfo, or read
+        # one in from the filename and remember it there.
+        doc = pageinfo.doc
+        if doc is None:
+            doc = PdfReader(pageinfo.docname,
+                            decompress=not allow_compressed)
+            pageinfo.doc = doc
+    else:
+        # An explicit document must not conflict with an implicit one
+        assert pageinfo.doc is None
+        pageinfo.doc = doc
+    assert isinstance(doc, PdfReader)
+
+    pagenum = pageinfo.page or 1
+    sourcepage = doc.pages[pagenum - 1]
+    return pagexobj(sourcepage, pageinfo, allow_compressed)
+
+
+class CacheXObj(object):
+    ''' Use to keep from reparsing files over and over,
+        and to keep from making the output too much
+        bigger than it ought to be by replicating
+        unnecessary object copies.
+
+        This is a convenience class for things like
+        rst2pdf that want to be able to pass in textual
+        filename/location descriptors and don't want to
+        know about using PdfReader.
+    '''
+    def __init__(self, decompress=False):
+        ''' Set decompress true if you need
+            the Form XObjects to be decompressed.
+            Will decompress what it can and scream
+            about the rest.
+        '''
+        # Maps document name -> PdfReader
+        self.cached_pdfs = {}
+        self.decompress = decompress
+
+    def load(self, sourcename):
+        ''' Load a Form XObject from a uri
+        '''
+        info = ViewInfo(sourcename)
+        fname = info.docname
+        readers = self.cached_pdfs
+        doc = readers.get(fname)
+        if doc is None:
+            # First use of this document: parse it once and keep it
+            doc = PdfReader(fname, decompress=self.decompress)
+            readers[fname] = doc
+        return docxobj(info, doc, allow_compressed=not self.decompress)
--- a/pdfrw/compress.py
+++ b/pdfrw/compress.py
@ -0,0 +1,27 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+Currently, this sad little file only knows how to compress
+using the flate (zlib) algorithm.  Maybe more later, but it's
+not a priority for me...
+'''
+
+from .objects import PdfName
+from .uncompress import streamobjects
+from .py23_diffs import zlib, convert_load, convert_store
+
+
+def compress(mylist):
+    ''' Flate-compress, in place, every stream object in mylist
+        that has no Filter yet.  A stream is only replaced when the
+        compressed form is less than 30 characters longer than the
+        original.
+    '''
+    flate_name = PdfName.FlateDecode
+    for obj in streamobjects(mylist):
+        if obj.Filter is not None:
+            # Already filtered -- leave it alone
+            continue
+        raw = obj.stream
+        packed = convert_load(zlib.compress(convert_store(raw)))
+        if len(packed) >= len(raw) + 30:
+            continue
+        obj.stream = packed
+        obj.Filter = flate_name
+        obj.DecodeParms = None
--- a/pdfrw/crypt.py
+++ b/pdfrw/crypt.py
@ -0,0 +1,150 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2017  Jon Lund Steffensen
+# MIT license -- See LICENSE.txt for details
+
+from __future__ import division
+
+import hashlib
+import struct
+
+try:
+    from Crypto.Cipher import ARC4, AES
+    HAS_CRYPTO = True
+except ImportError:
+    HAS_CRYPTO = False
+
+from .objects import PdfDict, PdfName
+
+# 32-character padding string.  Passwords are extended with it and
+# cut to 32 characters in create_key; create_user_hash hashes or
+# encrypts it directly.
+_PASSWORD_PAD = (
+    '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
+    '..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
+
+
+def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
+    """Yield the items of mylist that are PdfDicts with a stream."""
+    for candidate in mylist:
+        if not isinstance(candidate, PdfDict):
+            continue
+        if candidate.stream is None:
+            continue
+        yield candidate
+
+
+def create_key(password, doc):
+    """Create an encryption key (Algorithm 2 in PDF spec).
+
+    The key is derived from the padded password, the /O and /P entries
+    of doc.Encrypt and the first element of doc.ID.  Its length is
+    /Length bits (default 40).  For revision 3 and above the digest is
+    re-hashed 50 times.
+    """
+    encrypt = doc.Encrypt
+    key_size = int(encrypt.Length or 40) // 8
+    padded_pass = (password + _PASSWORD_PAD)[:32]
+    hasher = hashlib.md5()
+    hasher.update(padded_pass)
+    hasher.update(encrypt.O.to_bytes())
+    # /P is a 32-bit flag word.  Some files store it as an unsigned
+    # value (e.g. 4294967292 instead of -4), which a signed '<i' pack
+    # rejects with struct.error.  Masking and packing unsigned gives
+    # the same four bytes for both spellings.
+    permissions = int(encrypt.P) & 0xffffffff
+    hasher.update(struct.pack('<I', permissions))
+    hasher.update(doc.ID[0].to_bytes())
+    temp_hash = hasher.digest()
+
+    if int(encrypt.R or 0) >= 3:
+        for _ in range(50):
+            temp_hash = hashlib.md5(temp_hash[:key_size]).digest()
+
+    return temp_hash[:key_size]
+
+
+def create_user_hash(key, doc):
+    """Create the user password hash (Algorithm 4/5).
+
+    Revisions below 3 RC4-encrypt the padding string with the key.
+    Later revisions hash the padding string and the first document ID,
+    then run 20 RC4 rounds, round i using the key XORed with i.
+    """
+    if int(doc.Encrypt.R or 0) < 3:
+        return ARC4.new(key).encrypt(_PASSWORD_PAD)
+
+    hasher = hashlib.md5()
+    hasher.update(_PASSWORD_PAD)
+    hasher.update(doc.ID[0].to_bytes())
+    digest = hasher.digest()
+
+    for i in range(20):
+        round_key = ''.join(chr(i ^ ord(x)) for x in key)
+        digest = ARC4.new(round_key).encrypt(digest)
+
+    return digest
+
+
+def check_user_password(key, doc):
+    """Check that the user password is correct (Algorithm 6).
+
+    Compares the hash computed from key with the /U entry; for
+    revision 3 and above only the first 16 bytes of /U are compared.
+    """
+    expected = create_user_hash(key, doc)
+    revision = int(doc.Encrypt.R or 0)
+    stored = doc.Encrypt.U.to_bytes()
+    if revision >= 3:
+        stored = stored[:16]
+    return stored == expected
+
+
+class AESCryptFilter(object):
+    """Crypt filter corresponding to /AESV2."""
+    def __init__(self, key):
+        self._key = key
+
+    def decrypt_data(self, num, gen, data):
+        """Decrypt data (string/stream) using key (Algorithm 1).
+
+        num and gen are the object and generation numbers of the
+        object that holds the data.
+        """
+        # Per-object key: file key + 3 bytes of num + 2 bytes of gen
+        # + the 'sAlT' marker, hashed with MD5.
+        suffix = struct.pack('<i', num)[:3] + struct.pack('<i', gen)[:2]
+        suffix += 'sAlT'
+        object_key = hashlib.md5(self._key + suffix).digest()
+
+        # The first block of the data is the CBC initialization vector
+        block = AES.block_size
+        iv = data[:block]
+        cipher = AES.new(object_key, AES.MODE_CBC, iv)
+        plain = cipher.decrypt(data[block:])
+
+        # Remove padding
+        pad_size = ord(plain[-1])
+        assert 1 <= pad_size <= 16
+        return plain[:-pad_size]
+
+
+class RC4CryptFilter(object):
+    """Crypt filter corresponding to /V2."""
+    def __init__(self, key):
+        self._key = key
+
+    def decrypt_data(self, num, gen, data):
+        """Decrypt data (string/stream) using key (Algorithm 1).
+
+        num and gen are the object and generation numbers of the
+        object that holds the data.
+        """
+        # Per-object key: file key + 3 bytes of num + 2 bytes of gen,
+        # hashed with MD5 and cut to len(key) + 5 bytes (at most 16).
+        suffix = struct.pack('<i', num)[:3] + struct.pack('<i', gen)[:2]
+        digest = hashlib.md5(self._key + suffix).digest()
+        object_key = digest[:min(len(self._key) + 5, 16)]
+
+        return ARC4.new(object_key).decrypt(data)
+
+
+class IdentityCryptFilter(object):
+    """Identity crypt filter (pass through with no encryption)."""
+    def decrypt_data(self, num, gen, data):
+        """Return data unchanged; num and gen are ignored."""
+        unchanged = data
+        return unchanged
+
+
+def decrypt_objects(objects, default_filter, filters):
+    """Decrypt list of stream objects.
+
+    The parameter default_filter specifies the default filter to use. The
+    filters parameter is a dictionary of alternate filters to use when the
+    object specifies an alternate filter locally.
+
+    Objects already marked as decrypted are skipped.  Each processed
+    object has its stream replaced, is marked as decrypted, and has a
+    leading /Crypt entry removed from its Filter.
+    """
+    for obj in streamobjects(objects):
+        if getattr(obj, 'decrypted', False):
+            continue
+
+        crypt_filter = default_filter
+
+        # Check whether a locally defined crypt filter should override the
+        # default filter.
+        ftype = obj.Filter
+        if ftype is not None:
+            if not isinstance(ftype, list):
+                ftype = [ftype]
+            if ftype and ftype[0] == PdfName.Crypt:
+                ftype = ftype[1:]
+                parms = obj.DecodeParms or obj.DP
+                crypt_filter = filters[parms.Name]
+
+        num, gen = obj.indirect
+        obj.stream = crypt_filter.decrypt_data(num, gen, obj.stream)
+        obj.private.decrypted = True
+        obj.Filter = ftype or None
--- a/pdfrw/errors.py
+++ b/pdfrw/errors.py
@ -0,0 +1,41 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+PDF Exceptions and error handling
+'''
+
+import logging
+
+
+# Layout of messages emitted through the pdfrw logger: level name, then
+# the file name and line number of the logging call, then the message.
+fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d %(message)s')
+
+# Handler created without an explicit stream, so logging's default
+# stream is used.
+handler = logging.StreamHandler()
+handler.setFormatter(fmt)
+
+# Package-wide logger.  Its level is WARNING, so less severe records
+# are dropped unless a user lowers the level.
+log = logging.getLogger('pdfrw')
+log.setLevel(logging.WARNING)
+log.addHandler(handler)
+
+
+class PdfError(Exception):
+    '''Base class of the exceptions defined in this module.
+
+    The message given to the constructor is stored on the ``msg``
+    attribute and is what ``str()`` of the exception returns.
+    '''
+
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return self.msg
+
+
+class PdfParseError(PdfError):
+    "Error raised by the parser/tokenizer"
+
+
+class PdfOutputError(PdfError):
+    "Error raised by the PDF writer"
+
+
+class PdfNotImplementedError(PdfError):
+    "Error raised when a required feature is not implemented"
--- a/pdfrw/findobjs.py
+++ b/pdfrw/findobjs.py
@ -0,0 +1,137 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+''' This module contains a function to find all the XObjects
+    in a document, and another function that will wrap them
+    in page objects.
+'''
+
+from .objects import PdfDict, PdfArray, PdfName
+
+
+def find_objects(source, valid_types=(PdfName.XObject, None),
+                 valid_subtypes=(PdfName.Form, PdfName.Image),
+                 no_follow=(PdfName.Parent,),
+                 isinstance=isinstance, id=id, sorted=sorted,
+                 reversed=reversed, PdfDict=PdfDict):
+    '''
+        Yield every dictionary reachable from source whose /Type is in
+        valid_types and whose /Subtype is in valid_subtypes.  With the
+        defaults, that means Form and Image XObjects.
+
+        source may be a single PdfDict or an iterable (for example a
+        list of pages).  Dictionary entries whose key is in no_follow
+        are not descended into.
+
+        The walk uses an explicit stack instead of recursion, because
+        some PDFs are nested very deeply.
+
+        We do not know exactly where things appear on pages, so the
+        goal is an order that is (a) mostly document order and
+        (b) reproducible: arrays are walked in array order and
+        dictionaries in sorted key order.
+    '''
+    containers = (PdfDict, PdfArray)
+
+    # Accept either one dictionary or an iterable of objects.
+    pending = [source] if isinstance(source, PdfDict) else list(source)
+
+    # The list is used as a stack, so keep the first item on top.
+    pending.reverse()
+    seen = set()
+    while pending:
+        node = pending.pop()
+        if not isinstance(node, containers):
+            continue
+        key = id(node)
+        if key in seen:
+            continue
+        seen.add(key)
+        if isinstance(node, PdfDict):
+            if node.Type in valid_types and node.Subtype in valid_subtypes:
+                yield node
+            children = [value for (name, value) in sorted(node.iteritems())
+                        if name not in no_follow]
+        else:
+            # TODO: Indexing forces resolution of any indirect objects
+            # in the array before reversed() sees it.  This may not be
+            # necessary; it is cheap enough for now, but might be
+            # removable.
+            node and node[0]
+            children = node
+        pending.extend(reversed(children))
+
+
+def wrap_object(obj, width, margin):
+    ''' Build a new indirect page dictionary that shows a single XObject.
+
+        A Form XObject is reused directly: its stream, /Length, /Filter
+        and /DecodeParms become the page contents, its /Resources the
+        page resources and its /BBox the page /MediaBox.
+
+        An Image XObject is scaled to fit width minus margin[0] and
+        margin[2], keeping its aspect ratio, and drawn at offset
+        (margin[0], margin[1]); the page height is the scaled image
+        height plus margin[1] and margin[3].
+
+        Any other subtype raises TypeError.
+    '''
+    contents = PdfDict(indirect=True)
+    kind = obj.Subtype
+    if kind == PdfName.Form:
+        # _stream stores the data without recomputing /Length, which
+        # is copied from the form below.
+        contents._stream = obj.stream
+        contents.Length = obj.Length
+        contents.Filter = obj.Filter
+        contents.DecodeParms = obj.DecodeParms
+        resources = obj.Resources
+        mbox = obj.BBox
+    elif kind == PdfName.Image:
+        left = margin[0]
+        bottom = margin[1]
+        cw = width - margin[0] - margin[2]
+        iw = float(obj.Width)
+        ih = float(obj.Height)
+        ch = 1.0 * cw / iw * ih
+        height = ch + margin[1] + margin[3]
+        # Fixed-point text with trailing zeros (and a then-trailing
+        # decimal point) trimmed off.
+        numbers = [('%.9f' % value).rstrip('0').rstrip('.')
+                   for value in (cw, ch, left, bottom)]
+        contents.stream = ('q %s 0 0 %s %s %s cm /MyImage Do Q'
+                           % tuple(numbers))
+        resources = PdfDict(XObject=PdfDict(MyImage=obj))
+        mbox = PdfArray((0, 0, width, height))
+    else:
+        raise TypeError("Expected Form or Image XObject")
+
+    page = PdfDict(
+        indirect=True,
+        Type=PdfName.Page,
+        MediaBox=mbox,
+        Resources=resources,
+        Contents=contents,
+        )
+    return page
+
+
+def trivial_xobjs(maxignore=300):
+    ''' Return a predicate that spots XObjects which trivially contain
+        other XObjects.
+
+        The predicate returns False for Image XObjects.  It returns
+        True when the stream is shorter than maxignore and, apart from
+        names (tokens starting with "/") and the operators q, Q, cm
+        and Do, holds nothing but numbers.  Otherwise it returns None.
+    '''
+    operators = set('q Q cm Do'.split())
+    image = PdfName.Image
+
+    def check(obj):
+        if obj.Subtype == image:
+            return False
+        stream = obj.stream
+        if len(stream) >= maxignore:
+            return
+        for token in stream.split():
+            if token.startswith('/') or token in operators:
+                continue
+            # What is left must be a number once "." and "-" are removed.
+            if not token.replace('.', '').replace('-', '').isdigit():
+                return
+        return True
+    return check
+
+
+def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
+                  image_only=False, ignore=trivial_xobjs(),
+                  wrap_object=wrap_object):
+    ''' Yield one page object for every XObject found.
+
+        xobj_iter is either an iterable of XObjects, or a list or dict
+        that is first searched with find_objects().
+
+        width and margin are used to set image sizes.  margin is a
+        single number or a sequence of numbers; it is doubled until it
+        holds at least four entries.  The default wrap_object reads
+        entries 0 and 1 as the x and y offsets of the image and
+        entries 2 and 3 as the margins on the opposite sides.
+
+        With image_only set, only Image XObjects are wrapped.  Objects
+        for which ignore(obj) is true are skipped.
+
+        Raises ValueError if margin is an empty sequence.
+    '''
+    # Work on a private list so that a list supplied by the caller is
+    # never modified by the in-place doubling below.
+    try:
+        margin = list(margin)
+    except TypeError:
+        # Not iterable: a single number applies to every side.
+        margin = [margin]
+    if not margin:
+        # Doubling an empty list would never reach four entries.
+        raise ValueError('margin must not be empty')
+    while len(margin) < 4:
+        margin *= 2
+
+    if isinstance(xobj_iter, (list, dict)):
+        xobj_iter = find_objects(xobj_iter)
+    for obj in xobj_iter:
+        if not ignore(obj):
+            if not image_only or obj.Subtype == PdfName.Image:
+                yield wrap_object(obj, width, margin)
--- a/pdfrw/objects/init.py
+++ b/pdfrw/objects/init.py
@ -0,0 +1,19 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+Objects that can occur in PDF files.  The most important
+objects are arrays and dicts.  Either of these can be
+indirect or not, and dicts could have an associated
+stream.
+'''
+from .pdfname import PdfName
+from .pdfdict import PdfDict, IndirectPdfDict
+from .pdfarray import PdfArray
+from .pdfobject import PdfObject
+from .pdfstring import PdfString
+from .pdfindirect import PdfIndirect
+
+# Public names re-exported by the objects package.
+__all__ = [
+    'PdfName', 'PdfDict', 'IndirectPdfDict', 'PdfArray',
+    'PdfObject', 'PdfString', 'PdfIndirect',
+]
--- a/pdfrw/objects/pdfarray.py
+++ b/pdfrw/objects/pdfarray.py
@ -0,0 +1,71 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+from .pdfindirect import PdfIndirect
+from .pdfobject import PdfObject
+
+
+def _resolved():
+    ''' Do nothing.  PdfArray installs this as its _resolve hook once
+        the array has been resolved, so later accesses skip the pass.
+    '''
+
+
+class PdfArray(list):
+    ''' A PdfArray maps the PDF file array object into a Python list.
+        It has an indirect attribute which defaults to False.
+
+        Elements may start out as PdfIndirect placeholders.  They are
+        replaced by their real values (PdfObject('null') when a
+        placeholder resolves to None) the first time the array is read
+        through one of the accessors defined here.
+    '''
+    indirect = False
+
+    def __init__(self, source=()):
+        # Every instance starts with _resolve bound to the real
+        # resolver; the resolver swaps in a no-op once it has run.
+        self._resolve = self._resolver
+        self.extend(source)
+
+    def _resolver(self, isinstance=isinstance, enumerate=enumerate,
+                  listiter=list.__iter__, PdfIndirect=PdfIndirect,
+                  resolved=_resolved, PdfNull=PdfObject('null')):
+        ''' Replace every PdfIndirect element with its real value,
+            then disable further resolution for this instance.
+        '''
+        # listiter is the plain list iterator; using self.__iter__
+        # here would call _resolve again.
+        for index, value in enumerate(listiter(self)):
+            if isinstance(value, PdfIndirect):
+                value = value.real_value()
+                if value is None:
+                    value = PdfNull
+                self[index] = value
+        self._resolve = resolved
+
+    def __getitem__(self, index, listget=list.__getitem__):
+        self._resolve()
+        return listget(self, index)
+
+    try:
+        # list.__getslice__ only exists on Python 2.  Elsewhere,
+        # evaluating the default argument raises AttributeError and
+        # the method is simply not defined.
+        def __getslice__(self, i, j, listget=list.__getslice__):
+            self._resolve()
+            return listget(self, i, j)
+    except AttributeError:
+        pass
+
+    def __iter__(self, listiter=list.__iter__):
+        self._resolve()
+        return listiter(self)
+
+    def count(self, item):
+        self._resolve()
+        return list.count(self, item)
+
+    def index(self, item, *args):
+        ''' Same as list.index, including the optional start and stop
+            positions, after resolving indirect elements.
+        '''
+        self._resolve()
+        return list.index(self, item, *args)
+
+    def remove(self, item):
+        self._resolve()
+        return list.remove(self, item)
+
+    def sort(self, *args, **kw):
+        self._resolve()
+        return list.sort(self, *args, **kw)
+
+    def pop(self, *args):
+        self._resolve()
+        return list.pop(self, *args)
+
+    def __reversed__(self):
+        self._resolve()
+        return list.__reversed__(self)
--- a/pdfrw/objects/pdfdict.py
+++ b/pdfrw/objects/pdfdict.py
@ -0,0 +1,241 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+from .pdfname import PdfName, BasePdfName
+from .pdfindirect import PdfIndirect
+from .pdfobject import PdfObject
+from ..py23_diffs import iteritems
+from ..errors import PdfParseError
+
+
+class _DictSearch(object):
+    '''  View on a dictionary that looks up inheritable attributes:
+         a key missing from the dictionary itself is searched for
+         along its chain of /Parent dictionaries.
+    '''
+
+    def __init__(self, basedict):
+        self.basedict = basedict
+
+    def __getattr__(self, name, PdfName=PdfName):
+        return self[PdfName(name)]
+
+    def __getitem__(self, name, set=set, getattr=getattr, id=id):
+        ''' Return the first non-None value for name found in the base
+            dictionary or one of its ancestors, or None if the chain
+            ends without one.
+        '''
+        node = self.basedict
+        seen = set()
+        while True:
+            found = node[name]
+            if found is not None:
+                return found
+            marker = id(node)
+            # Seeing the same dictionary twice means /Parent loops.
+            assert marker not in seen
+            seen.add(marker)
+            node = node.Parent
+            if node is None:
+                return None
+
+
+class _Private(object):
+    ''' Write-through proxy for private attributes of a PdfDict.
+        Whatever is assigned on the proxy lands in the instance
+        dictionary of the wrapped object, so it is not part of the
+        PDF dictionary and is not output to PDF files.
+    '''
+
+    def __init__(self, pdfdict):
+        # Written straight into our own __dict__: a normal assignment
+        # would be routed to __setattr__ below.
+        self.__dict__['pdfdict'] = pdfdict
+
+    def __setattr__(self, name, value):
+        target = self.pdfdict
+        target.__dict__[name] = value
+
+
+class PdfDict(dict):
+    ''' PdfDict objects are subclassed dictionaries
+        with the following features:
+
+        - Every key in the dictionary starts with "/"
+
+        - A dictionary item can be deleted by assigning it to None
+
+        - Keys that (after the initial "/") conform to Python
+          naming conventions can also be accessed (set and retrieved)
+          as attributes of the dictionary.  E.g.  mydict.Page is the
+          same thing as mydict['/Page']
+
+        - Private attributes (not in the PDF space) can be set
+          on the dictionary object attribute dictionary by using
+          the private attribute:
+
+                mydict.private.foo = 3
+                mydict.foo = 5
+                x = mydict.foo       # x will now contain 3
+                y = mydict['/foo']   # y will now contain 5
+
+          Most standard adobe dictionary keys start with an upper case letter,
+          so to avoid conflicts, it is best to start private attributes with
+          lower case letters.
+
+        - PdfDicts have the following read-only properties:
+
+            - private -- as discussed above, provides write access to
+                         dictionary's attributes
+            - inheritable -- this creates and returns a "view" attribute
+                         that will search through the object hierarchy for
+                         any desired attribute, such as /Rotate or /MediaBox
+
+        - PdfDicts also have the following special attributes:
+            - indirect is not stored in the PDF dictionary, but in the object's
+              attribute dictionary
+            - stream is also stored in the object's attribute dictionary
+              and will also update the stream length.
+            - _stream will store in the object's attribute dictionary without
+              updating the stream length.
+
+            It is possible, for example, to have a PDF name such as "/indirect"
+            or "/stream", but you cannot access such a name as an attribute:
+
+                mydict.indirect -- accesses object's attribute dictionary
+                mydict["/indirect"] -- accesses actual PDF dictionary
+    '''
+    indirect = False
+    stream = None
+
+    # Attribute names that __setattr__ treats specially.  Each maps to
+    # (name used in the instance dictionary, whether /Length is updated).
+    _special = dict(indirect=('indirect', False),
+                    stream=('stream', True),
+                    _stream=('stream', False),
+                    )
+
+    def __setitem__(self, name, value, setter=dict.__setitem__,
+                    BasePdfName=BasePdfName, isinstance=isinstance):
+        ''' Store value under name, which must be a PdfName.
+            Assigning None removes the key if it is present.
+        '''
+        if not isinstance(name, BasePdfName):
+            raise PdfParseError('Dict key %s is not a PdfName' % repr(name))
+        if value is not None:
+            setter(self, name, value)
+        elif name in self:
+            del self[name]
+
+    def __init__(self, *args, **kw):
+        ''' Positional arguments are handed to dict.update(): a single
+            argument as is, several as one tuple.  When the single
+            argument is a PdfDict, its indirect flag and stream are
+            copied as well.  Keyword arguments are set as attributes.
+        '''
+        if args:
+            if len(args) == 1:
+                args = args[0]
+            self.update(args)
+            if isinstance(args, PdfDict):
+                self.indirect = args.indirect
+                # _stream copies the data without touching /Length,
+                # which update() already brought over.
+                self._stream = args.stream
+        for key, value in iteritems(kw):
+            setattr(self, key, value)
+
+    def __getattr__(self, name, PdfName=PdfName):
+        ''' If the attribute doesn't exist on the dictionary object,
+            try to slap a '/' in front of it and get it out
+            of the actual dictionary itself.
+        '''
+        return self.get(PdfName(name))
+
+    def get(self, key, dictget=dict.get, isinstance=isinstance,
+            PdfIndirect=PdfIndirect):
+        ''' Get a value out of the dictionary,
+            after resolving any indirect objects.
+
+            Returns None for a missing key.  An indirect object that
+            resolves to None is removed from the dictionary.
+        '''
+        value = dictget(self, key)
+        if isinstance(value, PdfIndirect):
+            # We used to use self[key] here, but that does an
+            # unwanted check on the type of the key (github issue #98).
+            # Python will keep the old key object in the dictionary,
+            # so that check is not necessary.
+            value = value.real_value()
+            if value is not None:
+                dict.__setitem__(self, key, value)
+            else:
+                # None means "not present", so drop the placeholder.
+                del self[key]
+        return value
+
+    def __getitem__(self, key):
+        return self.get(key)
+
+    def __setattr__(self, name, value, special=_special.get,
+                    PdfName=PdfName, vars=vars):
+        ''' Set an attribute on the dictionary.  Handle the keywords
+            indirect, stream, and _stream specially (for content objects)
+        '''
+        info = special(name)
+        if info is None:
+            self[PdfName(name)] = value
+        else:
+            name, setlen = info
+            vars(self)[name] = value
+            if setlen:
+                # Keep /Length in step with the stream; a stream of
+                # None removes /Length.
+                self.Length = (PdfObject(len(value))
+                               if value is not None else None)
+
+    def iteritems(self, dictiter=iteritems,
+                  isinstance=isinstance, PdfIndirect=PdfIndirect,
+                  BasePdfName=BasePdfName):
+        ''' Iterate over the dictionary, resolving any unresolved objects
+        '''
+        # Iterate over a snapshot: resolving can replace or delete
+        # entries while we are looping.
+        for key, value in list(dictiter(self)):
+            if isinstance(value, PdfIndirect):
+                self[key] = value = value.real_value()
+            if value is not None:
+                if not isinstance(key, BasePdfName):
+                    raise PdfParseError('Dict key %s is not a PdfName' %
+                                        repr(key))
+                yield key, value
+
+    def items(self):
+        return list(self.iteritems())
+
+    def itervalues(self):
+        for key, value in self.iteritems():
+            yield value
+
+    def values(self):
+        return [value for key, value in self.iteritems()]
+
+    def keys(self):
+        return [key for key, value in self.iteritems()]
+
+    def __iter__(self):
+        for key, value in self.iteritems():
+            yield key
+
+    def iterkeys(self):
+        return iter(self)
+
+    def copy(self):
+        return type(self)(self)
+
+    def pop(self, key):
+        ''' Remove key and return its resolved value.
+            Raises KeyError if the key is not present.
+        '''
+        value = self.get(key)
+        del self[key]
+        return value
+
+    def popitem(self):
+        ''' Remove and return a (key, value) pair, as dict.popitem()
+            does, with an indirect value resolved first.
+            Raises KeyError if the dictionary is empty.
+        '''
+        key, value = dict.popitem(self)
+        if isinstance(value, PdfIndirect):
+            value = value.real_value()
+        return key, value
+
+    def inheritable(self):
+        ''' Search through ancestors as needed for inheritable
+            dictionary items.
+            NOTE:  You might think it would be a good idea
+            to cache this class, but then you'd have to worry
+            about it pointing to the wrong dictionary if you
+            made a copy of the object...
+        '''
+        return _DictSearch(self)
+    inheritable = property(inheritable)
+
+    def private(self):
+        ''' Allows setting private metadata for use in
+            processing (not sent to PDF file).
+            See note on inheritable
+        '''
+        return _Private(self)
+    private = property(private)
+
+
+class IndirectPdfDict(PdfDict):
+    ''' IndirectPdfDict is a convenience class: a PdfDict whose
+        indirect attribute defaults to True.  Creating a plain
+        PdfDict and then setting indirect = True on it gives the
+        same result.
+    '''
+    indirect = True
--- a/pdfrw/objects/pdfindirect.py
+++ b/pdfrw/objects/pdfindirect.py
@ -0,0 +1,22 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+
+class _NotLoaded(object):
+    ''' Sentinel: the class itself is the initial PdfIndirect.value
+        and marks an object that has not been loaded yet.
+    '''
+
+
+class PdfIndirect(tuple):
+    ''' Stand-in for an object that has not been read in yet.
+        The instance itself is the (object number, generation number)
+        tuple.  Other information, such as the loader used to fetch
+        the real object, is carried as attributes.
+    '''
+    value = _NotLoaded
+
+    def real_value(self, NotLoaded=_NotLoaded):
+        ''' Return the real object, loading it on first use.
+
+            The first call passes this placeholder to self._loader
+            and caches the result in self.value; later calls return
+            the cached result.  _loader is not defined in this class
+            and is expected to be supplied from outside.
+        '''
+        cached = self.value
+        if cached is not NotLoaded:
+            return cached
+        cached = self._loader(self)
+        self.value = cached
+        return cached
--- a/pdfrw/objects/pdfname.py
+++ b/pdfrw/objects/pdfname.py
@ -0,0 +1,81 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+import re
+
+from ..errors import log
+
+# Module-level shortcut for the warning method of the pdfrw logger.
+warn = log.warning
+
+
+class BasePdfName(str):
+    ''' A PdfName is an identifier that starts with
+        a slash.
+
+        The string value of the object is the decoded name, which is
+        what gets compared for equality in a PDF dictionary.
+
+        When the name contains whitespace or delimiter characters, the
+        object also carries an "encoded" attribute in which those
+        characters are escaped as #<hex><hex>.  The "encoded" attribute
+        is what is sent out to a PDF file.  It stays None when the
+        encoded and decoded forms are the same.
+    '''
+
+    indirect = False
+    encoded = None
+
+    whitespace = '\x00 \t\f\r\n'
+    delimiters = '()<>{}[]/%'
+    # Regular expression alternatives: raw whitespace characters plus
+    # backslash-escaped delimiters.
+    forbidden = list(whitespace) + ['\\' + x for x in delimiters]
+    # Maps each forbidden character to its #XX escape.
+    remap = dict((x, '#%02X' % ord(x)) for x in (whitespace + delimiters))
+    split_to_encode = re.compile('(%s)' % '|'.join(forbidden)).split
+    split_to_decode = re.compile(r'\#([0-9A-Fa-f]{2})').split
+
+    def __new__(cls, name, pre_encoded=True, remap=remap,
+                join=''.join, new=str.__new__, chr=chr, int=int,
+                split_to_encode=split_to_encode,
+                split_to_decode=split_to_decode,
+                ):
+        ''' We can build a PdfName from scratch, or from
+            a pre-encoded name (e.g. coming in from a file).
+
+            With pre_encoded true, #XX escapes in name are decoded.
+            Otherwise name is taken literally and its encoded form
+            is computed.
+        '''
+        # Fast path: nothing to escape or unescape.
+        if name[1:].isalnum():
+            return new(cls, name)
+
+        if pre_encoded:
+            encoded = name
+            if '#' in name:
+                # Odd positions hold the captured hex digit pairs.
+                pieces = split_to_decode(name)
+                pieces[1::2] = [chr(int(code, 16)) for code in pieces[1::2]]
+                name = join(pieces)
+        else:
+            # Odd positions hold the matched characters.  Position 1
+            # is the leading "/", which must stay as it is.
+            pieces = split_to_encode(name)
+            pieces[3::2] = [remap[char] for char in pieces[3::2]]
+            encoded = join(pieces)
+
+        result = new(cls, name)
+        if encoded != name:
+            result.encoded = encoded
+        return result
+
+
+# We could have used a metaclass, but this matches what
+# we were doing historically.
+
+class PdfName(object):
+    ''' Factory with two simple ways to get a PDF name from a string:
+
+                x = PdfName.FooBar
+                x = PdfName('FooBar')
+
+        Either technique will return "/FooBar"
+
+        The given text is not treated as pre-encoded.
+    '''
+
+    def __getattr__(self, name, BasePdfName=BasePdfName):
+        return BasePdfName('/' + name, pre_encoded=False)
+
+    def __call__(self, name, BasePdfName=BasePdfName):
+        return BasePdfName('/' + name, pre_encoded=False)
+
+# The module exports one shared instance under the name of the class.
+PdfName = PdfName()
--- a/pdfrw/objects/pdfobject.py
+++ b/pdfrw/objects/pdfobject.py
@ -0,0 +1,11 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+
+class PdfObject(str):
+    ''' A PdfObject is a textual representation of any PDF file object
+        other than an array, dict or string.  It is a plain str
+        subclass with an indirect attribute, which defaults to False.
+    '''
+    indirect = False
--- a/pdfrw/objects/pdfstring.py
+++ b/pdfrw/objects/pdfstring.py
@ -0,0 +1,553 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+#                    2016 James Laird-Wah, Sydney, Australia
+# MIT license -- See LICENSE.txt for details
+
+"""
+
+================================
+PdfString encoding and decoding
+================================
+
+Introduction
+=============
+
+
+This module handles encoding and decoding of PDF strings.  PDF strings
+are described in the PDF 1.7 reference manual, mostly in chapter 3
+(sections 3.2 and 3.8) and chapter 5.
+
+PDF strings are used in the document structure itself, and also inside
+the stream of page contents dictionaries.
+
+A PDF string can represent pure binary data (e.g. for a font or an
+image), or text, or glyph indices.  For Western fonts, the glyph indices
+usually correspond to ASCII, but that is not guaranteed.  (When it does
+happen, it makes examination of raw PDF data a lot easier.)
+
+The specification defines PDF string encoding at two different levels.
+At the bottom, it defines ways to encode arbitrary bytes so that a PDF
+tokenizer can understand they are a string of some sort, and can figure
+out where the string begins and ends.  (That is all the tokenizer itself
+cares about.)  Above that level, if the string represents text, the
+specification defines ways to encode Unicode text into raw bytes, before
+the byte encoding is performed.
+
+There are two ways to do the byte encoding, and two ways to do the text
+(Unicode) encoding.
+
+Encoding bytes into PDF strings
+================================
+
+Adobe calls the two ways to encode bytes into strings "Literal strings"
+and "Hexadecimal strings."
+
+Literal strings
+------------------
+
+A literal string is delimited by ASCII parentheses ("(" and ")"), and a
+hexadecimal string is delimited by ASCII less-than and greater-than
+signs ("<" and ">").
+
+A literal string may encode bytes almost unmolested.  The caveat is
+that if a byte has the same value as a parenthesis, it must be escaped
+so that the tokenizer knows the string is not finished.  This is accomplished
+by using the ASCII backslash ("\\") as an escape character.  Of course,
+now any backslash appearing in the data must likewise be escaped.
+
+Hexadecimal strings
+---------------------
+
+A hexadecimal string requires twice as much space as the source data
+it represents (plus two bytes for the delimiter), simply storing each
+byte as two hexadecimal digits, most significant digit first.  The spec
+allows for lower or upper case hex digits, but most PDF encoders seem
+to use upper case.
+
+Special cases -- Legacy systems and readability
+-----------------------------------------------
+
+It is possible to create a PDF document that uses 7 bit ASCII encoding,
+and it is desirable in many cases to create PDFs that are reasonably
+readable when opened in a text editor.  For these reasons, the syntax
+for both literal strings and hexadecimal strings is slightly more
+complicated than the initial description above.  In general, the additional
+syntax allows the following features:
+
+  - Making the delineation between characters, or between sections of
+    a string, apparent, and easy to see in an editor.
+  - Keeping output lines from getting too wide for some editors
+  - Keeping output lines from being so narrow that you can only see the
+    a small fraction of a string at a time in an editor.
+  - Suppressing unprintable characters
+  - Restricting the output string to 7 bit ASCII
+
+Hexadecimal readability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For hexadecimal strings, only the first two bullets are relevant.  The syntax
+to accomplish this is simple, allowing any ASCII whitespace to be inserted
+anywhere in the encoded hex string.
+
+Literal readability
+~~~~~~~~~~~~~~~~~~~
+
+For literal strings, all of the bullets except the first are relevant.
+The syntax has two methods to help with these goals.  The first method
+is to overload the escape operator to be able to do different functions,
+and the second method can reduce the number of escapes required for
+parentheses in the normal case.
+
+The escape function works differently, depending on what byte follows
+the backslash.  In all cases, the escaping backslash is discarded,
+and then the next character is examined:
+
+  - For parentheses and backslashes (and, in fact, for all characters
+    not described otherwise in this list), the character after the
+    backslash is preserved in the output.
+  - A letter from the set of "nrtbf" following a backslash is interpreted as
+    a line feed, carriage return, tab, backspace, or form-feed, respectively.
+  - One to three octal digits following the backslash indicate the
+    numeric value of the encoded byte.
+  - A carriage return, carriage return/line feed, or line feed following
+    the backslash indicates a line break that was put in for readability,
+    and that is not part of the actual data, so this is discarded.
+
+The second method that can be used to improve readability (and reduce space)
+in literal strings is to not escape parentheses.  This only works, and is
+only allowed, when the parentheses are properly balanced.  For example,
+"((Hello))" is a valid encoding for a literal string, but "((Hello)" is not;
+the latter case should be encoded "(\\(Hello)"
+
+Encoding text into strings
+==========================
+
+Section 3.8.1 of the PDF specification describes text strings.
+
+The individual characters of a text string can all be considered to
+be Unicode; Adobe specifies two different ways to encode these characters
+into a string of bytes before further encoding the byte string as a
+literal string or a hexadecimal string.
+
+The first way to encode these strings is called PDFDocEncoding.  This
+is mostly a one-for-one mapping of bytes into single bytes, similar to
+Latin-1.  The representable character set is limited to the number of
+characters that can fit in a byte, and this encoding cannot be used
+with Unicode strings that start with the two characters making up the
+UTF-16-BE BOM.
+
+The second way to encode these strings is with UTF-16-BE.  Text strings
+encoded with this method must start with the BOM, and although the spec
+does not appear to mandate that the resultant bytes be encoded into a
+hexadecimal string, that seems to be the canonical way to do it.
+
+When encoding a string into UTF-16-BE, this module always adds the BOM,
+and when decoding a string from UTF-16-BE, this module always strips
+the BOM.  If a source string contains a BOM, that will remain in the
+final string after a round-trip through the encoder and decoder, as
+the goal of the encoding/decoding process is transparency.
+
+
+PDF string handling in pdfrw
+=============================
+
+Responsibility for handling PDF strings in the pdfrw library is shared
+between this module, the tokenizer, and the pdfwriter.
+
+tokenizer string handling
+--------------------------
+
+As far as the tokenizer and its clients such as the pdfreader are concerned,
+the PdfString class must simply be something that it can instantiate by
+passing a string, that doesn't compare equal (or throw an exception when
+compared) to other possible token strings.  The tokenizer must understand
+enough about the syntax of the string to successfully find its beginning
+and end in a stream of tokens, but doesn't otherwise know or care about
+the data represented by the string.
+
+pdfwriter string handling
+--------------------------
+
+The pdfwriter knows and cares about two attributes of PdfString instances:
+
+  - First, PdfString objects have an 'indirect' attribute, which pdfwriter
+    uses as an indication that the object knows how to represent itself
+    correctly when output to a new PDF.  (In the case of a PdfString object,
+    no work is really required, because it is already a string.)
+  - Second, the PdfString.encode() method is used as a convenience to
+    automatically convert any user-supplied strings (that didn't come
+    from PDFs) when a PDF is written out to a file.
+
+pdfstring handling
+-------------------
+
+The code in this module is designed to support those uses by the
+tokenizer and the pdfwriter, and to additionally support encoding
+and decoding of PdfString objects as a convenience for the user.
+
+Most users of the pdfrw library never encode or decode a PdfString,
+so it is imperative that (a) merely importing this module does not
+take a significant amount of CPU time; and (b) it is cheap for the
+tokenizer to produce a PdfString, and cheap for the pdfwriter to
+consume a PdfString -- if the tokenizer finds a string that conforms
+to the PDF specification, it will be wrapped in a PdfString object,
+and if the pdfwriter finds an object with an indirect attribute, it
+simply calls str() to ask it to format itself.
+
+Encoding and decoding are not actually performed very often at all,
+compared to how often tokenization and then subsequent concatenation
+by the pdfwriter are performed.  In fact, versions of pdfrw prior to
+0.4 did not even support Unicode for this function.  Encoding and
+decoding can also easily be performed by the user, outside of the
+library, and this might still be recommended, at least for encoding,
+if the visual appeal of encodings generated by this module is found
+lacking.
+
+
+Decoding strings
+~~~~~~~~~~~~~~~~~~~
+
+Decoding strings can be tricky, but is a bounded process.  Each
+properly-encoded string represents exactly one output string,
+with the caveat that it is up to the caller of the function to know
+whether a Unicode string or just bytes is expected.
+
+The caller can call PdfString.to_bytes() to get a byte string (which may
+or may not represent encoded Unicode), or may call PdfString.to_unicode()
+to get a Unicode string.  Byte strings will be regular strings in Python 2,
+and b'' bytes in Python 3; Unicode strings will be regular strings in
+Python 3, and u'' unicode strings in Python 2.
+
+To maintain application compatibility with earlier versions of pdfrw,
+PdfString.decode() is an alias for PdfString.to_unicode().
+
+Encoding strings
+~~~~~~~~~~~~~~~~~~
+
+PdfString has three factory functions that will encode strings into
+PdfString objects:
+
+  -  PdfString.from_bytes() accepts a byte string (regular string in Python 2
+     or b'' bytes string in Python 3) and returns a PdfString object.
+  -  PdfString.from_unicode() accepts a Unicode string (u'' Unicode string in
+     Python 2 or regular string in Python 3) and returns a PdfString object.
+  -  PdfString.encode() examines the type of object passed, and either
+     calls from_bytes() or from_unicode() to do the real work.
+
+Unlike decoding, encoding is not (mathematically) a function.
+There are (literally) an infinite number of ways to encode any given
+source string.  (Of course, most of them would be stupid, unless
+the intent is some sort of denial-of-service attack.)
+
+So encoding strings is either simpler than decoding, or can be made to
+be an open-ended science fair project (to create the best looking
+encoded strings).
+
+There are parameters to the encoding functions that allow control over
+the final encoded string, but the intention is to make the default values
+produce a reasonable encoding.
+
+As mentioned previously, if encoding does not do what a particular
+user needs, that user is free to write his own encoder, and then
+simply instantiate a PdfString object by passing a string to the
+default constructor, the same way that the tokenizer does it.
+
+However, if desirable, encoding may gradually become more capable
+over time, adding the ability to generate more aesthetically pleasing
+encoded strings.
+
+PDFDocString encoding and decoding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To handle this encoding in a fairly standard way, this module registers
+an encoder and decoder for PDFDocEncoding with the codecs module.
+
+"""
+
+import re
+import codecs
+import binascii
+import itertools
+from ..py23_diffs import convert_load, convert_store
+
+def find_pdfdocencoding(encoding):
+    """ Codec search function for the codecs module registry.
+
+        Returns None for every name other than 'pdfdocencoding'.
+        For that name, the charmap tables are built on demand and
+        a codecs.CodecInfo wrapping them is returned.
+
+        PDFDocEncoding is described in the PDF 1.7 reference manual
+        (the table in section D.2).
+    """
+
+    if encoding != 'pdfdocencoding':
+        return
+
+    # Byte values that decode to the code point of the same value:
+    # tab, LF, CR, printable ASCII, and 0xA1-0xFF except 0xAD.
+    same = [0x09, 0x0A, 0x0D]
+    same.extend(range(0x20, 0x7F))
+    same.extend(x for x in range(0xA1, 0x100) if x != 0xAD)
+    decoding_map = dict(zip(same, same))
+
+    # Byte values that decode to some other Unicode code point,
+    # given as (first byte value, code points of a contiguous run).
+    special_runs = (
+        (0x18, (
+            0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC)),
+        (0x80, (
+            0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
+            0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018,
+            0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160,
+            0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E)),
+        (0xA0, (0x20AC,)),
+    )
+    for first, codepoints in special_runs:
+        for offset, codepoint in enumerate(codepoints):
+            decoding_map[first + offset] = codepoint
+
+    # The encoder table is derived now, before the lenient decoder
+    # entries below are added, so those entries are decode-only.
+    encoding_map = codecs.make_encoding_map(decoding_map)
+
+    # Not every PDF producer follows the spec, so conform to Postel's law
+    # and interpret encoded strings if at all possible.  In particular, they
+    # might have nulls and form-feeds, judging by random code snippets
+    # floating around the internet.
+    for control in range(0x18):
+        decoding_map[control] = control
+
+    def encode(input, errors='strict'):
+        return codecs.charmap_encode(input, errors, encoding_map)
+
+    def decode(input, errors='strict'):
+        return codecs.charmap_decode(input, errors, decoding_map)
+
+    return codecs.CodecInfo(encode, decode, name='pdfdocencoding')
+
+codecs.register(find_pdfdocencoding)
+
+class PdfString(str):
+    """ A PdfString is an encoded string.  It has a decode
+        method to get the actual string data out, and there
+        is an encode class method to create such a string.
+        Like any PDF object, it could be indirect, but it
+        defaults to being a direct object.
+
+        The string value of the object is the encoded form,
+        including its delimiters: (...) for a literal string
+        or <...> for a hexadecimal string.
+    """
+    indirect = False
+
+
+    # The byte order mark, and unicode that could be
+    # wrongly encoded into the byte order mark by the
+    # pdfdocencoding codec.
+
+    bytes_bom = codecs.BOM_UTF16_BE
+    bad_pdfdoc_prefix = bytes_bom.decode('latin-1')
+
+    # Used by decode_literal; filled in on first use
+
+    unescape_dict = None
+    unescape_func = None
+
+    @classmethod
+    def init_unescapes(cls):
+        """ Sets up the unescape attributes for decode_literal.
+
+            Stores the splitter in cls.unescape_func and the
+            escape lookup table in cls.unescape_dict, and returns
+            the splitter so the caller can use it immediately.
+        """
+        unescape_pattern = r'\\([0-7]{1,3}|\r\n|.)'
+        unescape_func = re.compile(unescape_pattern, re.DOTALL).split
+        cls.unescape_func = unescape_func
+
+        # By default, an escaped character stands for itself
+        # (the backslash is swallowed).
+        unescape_dict = dict(((chr(x), chr(x)) for x in range(0x100)))
+        unescape_dict.update(zip('nrtbf', '\n\r\t\b\f'))
+        # Backslash followed by end-of-line is a line continuation.
+        unescape_dict['\r'] = ''
+        unescape_dict['\n'] = ''
+        unescape_dict['\r\n'] = ''
+        for i in range(0o10):
+            unescape_dict['%01o' % i] = chr(i)
+        for i in range(0o100):
+            unescape_dict['%02o' % i] = chr(i)
+        # The pattern above accepts any three octal digits, i.e. values
+        # up to 0o777.  The PDF spec says high-order overflow is ignored,
+        # so keep only the low eight bits instead of leaving 400-777
+        # out of the table (which made decode_literal raise KeyError).
+        for i in range(0o1000):
+            unescape_dict['%03o' % i] = chr(i & 0xFF)
+        cls.unescape_dict = unescape_dict
+        return unescape_func
+
+    def decode_literal(self):
+        r""" Decode a PDF literal string, which is enclosed in parentheses ()
+
+            Many pdfrw users never decode strings, so defer creating
+            data structures to do so until the first string is decoded.
+
+            Possible string escapes from the spec:
+            (PDF 1.7 Reference, section 3.2.3, page 53)
+
+                1. \[nrtbf\()]: simple escapes
+                2. \\d{1,3}: octal. Must be zero-padded to 3 digits
+                    if followed by digit
+                3. \<end of line>: line continuation. We don't know the EOL
+                    marker used in the PDF, so accept \r, \n, and \r\n.
+                4. Any other character following \ escape -- the backslash
+                    is swallowed.
+
+            Returns the result of convert_store() on the unescaped text.
+        """
+        result = (self.unescape_func or self.init_unescapes())(self[1:-1])
+        if len(result) == 1:
+            # No backslashes at all; nothing to unescape.
+            return convert_store(result[0])
+        unescape_dict = self.unescape_dict
+        # Splitting on a pattern with one group puts the captured
+        # escape bodies at the odd indices.
+        result[1::2] = [unescape_dict[x] for x in result[1::2]]
+        return convert_store(''.join(result))
+
+
+    def decode_hex(self):
+        """ Decode a PDF hexadecimal-encoded string, which is enclosed
+            in angle brackets <>.
+
+            Whitespace between the digits is ignored.  If the number
+            of digits is odd, the final digit is assumed to be
+            followed by a 0.
+        """
+        hexstr = ''.join(self[1:-1].split())
+        if len(hexstr) % 2:  # odd number of chars indicates a truncated 0
+            hexstr += '0'
+        # Pad while still a text string; convert afterwards so that
+        # the concatenation above never mixes text and bytes.
+        return binascii.unhexlify(convert_store(hexstr))
+
+
+    def to_bytes(self):
+        """ Decode a PDF string to bytes.  This is a convenience function
+            for user code, in that (as of pdfrw 0.3) it is never
+            actually used inside pdfrw.
+
+            Raises ValueError if the string is delimited neither
+            by () nor by <>.
+        """
+        if self.startswith('(') and self.endswith(')'):
+            return self.decode_literal()
+
+        elif self.startswith('<') and self.endswith('>'):
+            return self.decode_hex()
+
+        else:
+            raise ValueError('Invalid PDF string "%s"' % repr(self))
+
+    def to_unicode(self):
+        """ Decode a PDF string to a unicode string.  This is a
+            convenience function for user code, in that (as of
+            pdfrw 0.3) it is never actually used inside pdfrw.
+
+            There are two Unicode storage methods used -- either
+            UTF16_BE, or something called PDFDocEncoding, which
+            is defined in the PDF spec.  The determination of
+            which decoding method to use is done by examining the
+            first two bytes for the byte order marker.
+        """
+        raw = self.to_bytes()
+
+        if raw[:2] == self.bytes_bom:
+            return raw[2:].decode('utf-16-be')
+        else:
+            return raw.decode('pdfdocencoding')
+
+    # Legacy-compatible interface
+    decode = to_unicode
+
+    # Internal value used by encoding
+
+    escape_splitter = None  # Calculated on first use
+
+    @classmethod
+    def init_escapes(cls):
+        """ Initialize the escape_splitter for the encode method
+        """
+        cls.escape_splitter = re.compile(br'(\(|\\|\))').split
+        return cls.escape_splitter
+
+    @classmethod
+    def from_bytes(cls, raw, bytes_encoding='auto'):
+        """ The from_bytes() constructor is called to encode a source raw
+            byte string into a PdfString that is suitable for inclusion
+            in a PDF.
+
+            NOTE:  There is no magic in the encoding process.  A user
+            can certainly do his own encoding, and simply initialize a
+            PdfString() instance with his encoded string.  That may be
+            useful, for example, to add line breaks to make it easier
+            to load PDFs into editors, or to not bother to escape balanced
+            parentheses, or to escape additional characters to make a PDF
+            more readable in a file editor.  Those are features not
+            currently supported by this method.
+
+            from_bytes() can use a heuristic to figure out the best
+            encoding for the string, or the user can control the process
+            by changing the bytes_encoding parameter to 'literal' or 'hex'
+            to force a particular conversion method.
+
+            Raises ValueError for any other bytes_encoding value.
+        """
+
+        # If hexadecimal is not being forced, then figure out how long
+        # the escaped literal string will be, and fall back to hex if
+        # it is too long.
+
+        force_hex = bytes_encoding == 'hex'
+        if not force_hex:
+            if bytes_encoding not in ('literal', 'auto'):
+                raise ValueError('Invalid bytes_encoding value: %s'
+                                 % bytes_encoding)
+            splitlist = (cls.escape_splitter or cls.init_escapes())(raw)
+            if bytes_encoding == 'auto' and len(splitlist) // 2 >= len(raw):
+                force_hex = True
+
+        if force_hex:
+            # The spec does not mandate uppercase,
+            # but it seems to be the convention.
+            fmt = '<%s>'
+            result = binascii.hexlify(raw).upper()
+        else:
+            fmt = '(%s)'
+            # The characters needing a backslash sit at the odd indices.
+            splitlist[1::2] = [(b'\\' + x) for x in splitlist[1::2]]
+            result = b''.join(splitlist)
+
+        return cls(fmt % convert_load(result))
+
+    @classmethod
+    def from_unicode(cls, source, text_encoding='auto',
+                     bytes_encoding='auto'):
+        """ The from_unicode() constructor is called to encode a source
+            string into a PdfString that is suitable for inclusion in a PDF.
+
+            NOTE:  There is no magic in the encoding process.  A user
+            can certainly do his own encoding, and simply initialize a
+            PdfString() instance with his encoded string.  That may be
+            useful, for example, to add line breaks to make it easier
+            to load PDFs into editors, or to not bother to escape balanced
+            parentheses, or to escape additional characters to make a PDF
+            more readable in a file editor.  Those are features not
+            supported by this method.
+
+            from_unicode() can use a heuristic to figure out the best
+            encoding for the string, or the user can control the process
+            by changing the text_encoding parameter to 'pdfdocencoding'
+            or 'utf16', and/or by changing the bytes_encoding parameter
+            to 'literal' or 'hex' to force particular conversion methods.
+
+            The function will raise an exception if it cannot perform
+            the conversion as requested by the user.
+        """
+
+        # Give preference to pdfdocencoding, since it only
+        # requires one raw byte per character, rather than two.
+        if text_encoding != 'utf16':
+            force_pdfdoc = text_encoding == 'pdfdocencoding'
+            if text_encoding != 'auto' and not force_pdfdoc:
+                raise ValueError('Invalid text_encoding value: %s'
+                                 % text_encoding)
+
+            # A string whose pdfdocencoding bytes would start with the
+            # UTF-16 byte order mark would be misread by to_unicode().
+            if source.startswith(cls.bad_pdfdoc_prefix):
+                if force_pdfdoc:
+                    raise UnicodeError('Prefix of string %r cannot be encoded '
+                                       'in pdfdocencoding' % source[:20])
+            else:
+                try:
+                    raw = source.encode('pdfdocencoding')
+                except UnicodeError:
+                    if force_pdfdoc:
+                        raise
+                else:
+                    return cls.from_bytes(raw, bytes_encoding)
+
+        # If the user is not forcing literal strings,
+        # it makes much more sense to use hexadecimal with 2-byte chars
+        raw = cls.bytes_bom + source.encode('utf-16-be')
+        encoding = 'hex' if bytes_encoding == 'auto' else bytes_encoding
+        return cls.from_bytes(raw, encoding)
+
+    @classmethod
+    def encode(cls, source, uni_type = type(u''), isinstance=isinstance):
+        """ The encode() constructor is a legacy function that is
+            also a convenience for the PdfWriter.
+
+            Dispatches to from_unicode() for Unicode strings and
+            to from_bytes() for anything else.
+        """
+        if isinstance(source, uni_type):
+            return cls.from_unicode(source)
+        else:
+            return cls.from_bytes(source)
--- a/pdfrw/pagemerge.py
+++ b/pdfrw/pagemerge.py
@ -0,0 +1,250 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+This module contains code to edit pages.  Sort of a canvas, I
+suppose, but I wouldn't want to call it that and get people all
+excited or anything.
+
+No, this is just for doing basic things like merging/splitting
+apart pages, watermarking, etc.  All it does is allow converting
+pages (or parts of pages) into Form XObject rectangles, and then
+plopping those down on new or pre-existing pages.
+'''
+
+from .objects import PdfDict, PdfArray, PdfName
+from .buildxobj import pagexobj, ViewInfo
+
+NullInfo = ViewInfo()
+
+
+class RectXObj(PdfDict):
+    ''' This class facilitates doing positioning (moving and scaling)
+        of Form XObjects within their containing page, by modifying
+        the Form XObject's transformation matrix.
+
+        By default, this class keeps the aspect ratio locked.  For
+        example, if your object is foo, you can write 'foo.w = 200',
+        and it will scale in both the x and y directions.
+
+        To unlock the aspect ratio, you have to do a tiny bit of math
+        and call the scale function.
+    '''
+    def __init__(self, page, viewinfo=NullInfo, **kw):
+        ''' The page is a page returned by PdfReader.  It will be
+            turned into a cached Form XObject (so that multiple
+            rectangles can be extracted from it if desired), and then
+            another Form XObject will be built using it and the viewinfo
+            (which should be a ViewInfo class).  The viewinfo includes
+            source coordinates (from the top/left) and rotation information.
+
+            Instead of a viewinfo, keyword arguments may be given; they
+            are passed to ViewInfo() to build one.  Giving both raises
+            ValueError.
+
+            Once the object has been built, its destination coordinates
+            may be examined and manipulated by using x, y, w, h, and
+            scale.  The destination coordinates are in the normal
+            PDF programmatic system (starting at bottom left).
+        '''
+        if kw:
+            if viewinfo is not NullInfo:
+                raise ValueError("Cannot modify preexisting ViewInfo")
+            viewinfo = ViewInfo(**kw)
+        viewinfo.cacheable = False
+        base = pagexobj(page, viewinfo)
+        self.update(base)
+        self.indirect = True
+        self.stream = base.stream
+        private = self.private
+        private._rect = [base.x, base.y, base.w, base.h]
+        matrix = self.Matrix
+        if matrix is None:
+            matrix = self.Matrix = PdfArray((1, 0, 0, 1, 0, 0))
+        private._matrix = matrix  # Lookup optimization
+        # Default to lower-left corner
+        self.x = 0
+        self.y = 0
+
+    @property
+    def x(self):
+        ''' X location (from left) of object in points
+        '''
+        return self._rect[0]
+
+    @property
+    def y(self):
+        ''' Y location (from bottom) of object in points
+        '''
+        return self._rect[1]
+
+    @property
+    def w(self):
+        ''' Width of object in points
+        '''
+        return self._rect[2]
+
+    @property
+    def h(self):
+        ''' Height of object in points
+        '''
+        return self._rect[3]
+
+    def __setattr__(self, name, value, next=PdfDict.__setattr__,
+                    mine=set('x y w h'.split())):
+        ''' The underlying __setitem__ won't let us use a property
+            setter, so we have to fake one.
+
+            Assigning x or y moves the object by adjusting the
+            translation entries of the matrix.  Assigning w or h
+            rescales the object uniformly so that the named
+            dimension takes the requested value.  Every other
+            attribute name is handed to PdfDict.__setattr__.
+        '''
+        if name not in mine:
+            return next(self, name, value)
+        if name in 'xy':
+            r_index, m_index = (0, 4) if name == 'x' else (1, 5)
+            self._rect[r_index], old = value, self._rect[r_index]
+            self._matrix[m_index] += value - old
+        else:
+            # Pick the dimension by the attribute NAME (2 is the
+            # width slot, 3 the height slot).  Comparing the value
+            # against 'h' always selected the width.
+            index = 2 + (name == 'h')
+            # The 1.0 avoids integer division on Python 2.
+            self.scale(value * 1.0 / self._rect[index])
+
+    def scale(self, x_scale, y_scale=None):
+        ''' Current scaling deals properly with things that
+            have been rotated in 90 degree increments
+            (via the ViewInfo object given when instantiating).
+
+            If y_scale is omitted, x_scale is used for both
+            directions.  The matrix and the stored width/height
+            are updated in place; x and y are left unchanged.
+        '''
+        if y_scale is None:
+            y_scale = x_scale
+        x, y, w, h = rect = self._rect
+        ao, bo, co, do, eo, fo = matrix = self._matrix
+        an = ao * x_scale
+        bn = bo * y_scale
+        cn = co * x_scale
+        dn = do * y_scale
+        # Scale the translation's offset from (x, y) by the same
+        # ratio as the matrix entries.
+        en = x + (eo - x) * 1.0 * (an + cn) / (ao + co)
+        fn = y + (fo - y) * 1.0 * (bn + dn) / (bo + do)
+        matrix[:] = an, bn, cn, dn, en, fn
+        rect[:] = x, y, w * x_scale, h * y_scale
+
+    @property
+    def box(self):
+        ''' Return the bounding box for the object
+            as a PdfArray [left, bottom, right, top].
+        '''
+        x, y, w, h = self._rect
+        return PdfArray([x, y, x + w, y + h])
+
+
+class PageMerge(list):
+    ''' A list of RectXObjs that are to be drawn on one page.
+
+        The list may also hold a single None entry, which marks
+        where the contents of an underlying page (given to the
+        constructor or to setpage) belong in the drawing order:
+        objects before it are drawn first, objects after it are
+        drawn afterwards.  render() produces the merged page.
+    '''
+    page = None
+    mbox = None
+    cbox = None
+    resources = None
+    rotate = None
+    contents = None
+
+    def __init__(self, page=None):
+        ''' Optionally start out with an underlying page.
+        '''
+        if page is None:
+            return
+        self.setpage(page)
+
+    def setpage(self, page):
+        ''' Use page as the underlying page.  A None placeholder
+            is appended to the list, and the page's boxes, resources,
+            rotation and contents are remembered for render().
+            Raises TypeError if the object's Type is not /Page.
+        '''
+        if page.Type != PdfName.Page:
+            raise TypeError("Expected page")
+        self.append(None)  # Placeholder
+        self.page = page
+        inherited = page.inheritable
+        self.mbox = inherited.MediaBox
+        self.cbox = inherited.CropBox
+        self.resources = inherited.Resources
+        self.rotate = inherited.Rotate
+        self.contents = page.Contents
+
+    def __add__(self, other):
+        ''' Add one object (a dict) or every object of an iterable,
+            in place, and return self.
+        '''
+        items = [other] if isinstance(other, dict) else other
+        for item in items:
+            self.add(item)
+        return self
+
+    def add(self, obj, prepend=False, **kw):
+        ''' Put obj at the end of the list, or at the front if
+            prepend is true.  If keyword arguments are given, or obj
+            is a page, it is wrapped in a RectXObj first (the keyword
+            arguments are passed along).  Returns self.
+        '''
+        # With keywords present, obj.Type is never looked at.
+        if kw or obj.Type == PdfName.Page:
+            obj = RectXObj(obj, **kw)
+        position = 0 if prepend else len(self)
+        self.insert(position, obj)
+        return self
+
+    def render(self):
+        ''' Build and return the merged page.  If there is no
+            underlying page, a new indirect page dictionary is made.
+        '''
+        mbox = self.mbox
+        cbox = self.cbox
+        page = self.page
+        original = self.contents
+        resources = self.resources or PdfDict()
+
+        # New XObject names are 'pdfrw_N'.  N is key_offset plus the
+        # current size of the XObject dictionary, so key_offset is
+        # chosen to make the first new N one more than the largest
+        # N already present.
+        key_offset = 0
+        xobjs = resources.XObject
+        if xobjs is None:
+            xobjs = PdfDict()
+            resources.XObject = xobjs
+        else:
+            taken = xobjs.keys()
+            if taken:
+                numbers = [int(key[7:]) for key in taken
+                           if key.startswith('/pdfrw_') and key[7:].isdigit()]
+                key_offset = (max(numbers) + 1) if numbers else 0
+                key_offset -= len(taken)
+
+        def draw(xobj_list, restore_first=False):
+            # Register each object under a fresh name and return a
+            # content stream that draws them all in order.
+            ops = []
+            if restore_first:
+                ops.append('Q')
+            for xobj in xobj_list:
+                name = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
+                if xobjs.setdefault(name, xobj) is not xobj:
+                    raise KeyError("XObj key %s already in use" % name)
+                ops.append('%s Do' % name)
+            return PdfDict(indirect=True, stream='\n'.join(ops))
+
+        if original is None:
+            new_contents = draw(self)
+        else:
+            if isinstance(original, PdfDict):
+                original = [original]
+            new_contents = PdfArray()
+            marker = self.index(None)
+            below = self[:marker]
+            above = self[marker + 1:]
+            if below:
+                new_contents.append(draw(below))
+
+            if above:
+                # There are elements to add after the original page contents,
+                # so push the graphics state to the stack. Restored below.
+                new_contents.append(PdfDict(indirect=True, stream='q'))
+
+            new_contents.extend(original)
+
+            if above:
+                # Restore graphics state and add other elements.
+                new_contents.append(draw(above, restore_first=True))
+
+        if mbox is None:
+            # No media box known: use the box around all the objects,
+            # extended to include the origin at its lower left.
+            cbox = None
+            mbox = self.xobj_box
+            mbox[0] = min(0, mbox[0])
+            mbox[1] = min(0, mbox[1])
+
+        if page is None:
+            page = PdfDict(indirect=True)
+        page.Type = PdfName.Page
+        page.Resources = resources
+        page.MediaBox = mbox
+        page.CropBox = cbox
+        page.Rotate = self.rotate
+        page.Contents = new_contents
+        return page
+
+    @property
+    def xobj_box(self):
+        ''' Return the smallest box that encloses every object
+            in the list.
+        '''
+        lefts, bottoms, rights, tops = zip(*[xobj.box for xobj in self])
+        return PdfArray((min(lefts), min(bottoms), max(rights), max(tops)))
--- a/pdfrw/pdfreader.py
+++ b/pdfrw/pdfreader.py
@ -0,0 +1,691 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# Copyright (C) 2012-2015 Nerijus Mika
+# MIT license -- See LICENSE.txt for details
+
+'''
+The PdfReader class reads an entire PDF file into memory and
+parses the top-level container objects.  (It does not parse
+into streams.)  The object subclasses PdfDict, and the
+document pages are stored in a list in the pages attribute
+of the object.
+'''
+import gc
+import binascii
+import collections
+import itertools
+
+from .errors import PdfParseError, log
+from .tokens import PdfTokens
+from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
+from .uncompress import uncompress
+from . import crypt
+from .py23_diffs import convert_load, convert_store, iteritems
+
+
+class PdfReader(PdfDict):
+
+    def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
+        ''' Return the entry registered for (objnum, gennum).  If
+            there is none yet, register a PdfIndirect placeholder
+            for it, note the key as deferred, and return the
+            placeholder.
+        '''
+        key = int(objnum), int(gennum)
+        registry = self.indirect_objects
+        found = registry.get(key)
+        if found is not None:
+            return found
+        placeholder = PdfIndirect(key)
+        registry[key] = placeholder
+        self.deferred_objects.add(key)
+        placeholder._loader = self.loadindirect
+        return placeholder
+
+    def readarray(self, source, PdfArray=PdfArray):
+        ''' Called after a [ token has been seen.  Consumes tokens
+            from source up to the matching ] and returns the
+            collected elements as a PdfArray.
+        '''
+        lookup = self.special.get
+        items = []
+
+        for token in source:
+            if token not in ']R':
+                # Ordinary element; tokens with a special handler
+                # (looked up in self.special) are replaced by
+                # whatever the handler reads from the source.
+                handler = lookup(token)
+                items.append(token if handler is None else handler(source))
+                continue
+            if token == ']':
+                break
+            # 'R': the two preceding elements are the object and
+            # generation numbers of an indirect reference.
+            gennum = items.pop()
+            objnum = items.pop()
+            items.append(self.findindirect(objnum, gennum))
+        return PdfArray(items)
+
+    def readdict(self, source, PdfDict=PdfDict):
+        ''' Called after a << token has been seen.  Consumes
+            key/value tokens from source up to the matching >>
+            and returns them in a new PdfDict.
+        '''
+        lookup = self.special.get
+        parsed = PdfDict()
+        advance = source.next
+
+        token = advance()
+        while token != '>>':
+            if not token.startswith('/'):
+                # Not a name where a key should be: report and skip it.
+                source.error('Expected PDF /name object')
+                token = advance()
+                continue
+            name = token
+            value = advance()
+            handler = lookup(value)
+            if handler is None:
+                # Look one token ahead: two integers in a row should
+                # be the start of an "objnum gennum R" reference.
+                token = advance()
+                if value.isdigit() and token.isdigit():
+                    marker = advance()
+                    if marker != 'R':
+                        # Drop this key, and resume with the token
+                        # that was found in place of the R.
+                        source.error('Expected "R" following two integers')
+                        token = marker
+                        continue
+                    value = self.findindirect(value, token)
+                    token = advance()
+            else:
+                value = handler(source)
+                token = advance()
+            parsed[name] = value
+        return parsed
+
+    def empty_obj(self, source, PdfObject=PdfObject):
+        ''' Handler for an object that turned out to be empty:
+            rewind the source to the start of the current token,
+            so that the caller gets to see the endobj.
+        '''
+        token_start = source.tokstart
+        source.floc = token_start
+
+    def badtoken(self, source):
+        ''' Handler for a delimiter that shows up where it is
+            not allowed: report it through source.exception.
+        '''
+        message = 'Unexpected delimiter'
+        source.exception(message)
+
+    def findstream(self, obj, tok, source, len=len):
+        ''' Figure out if there is a content stream
+            following an object, and return the start
+            pointer to the content stream if so.
+
+            (We can't read it yet, because we might not
+            know how long it is, because Length might
+            be an indirect object.)
+
+            tok is the stream keyword token that was just read;
+            the data starts after the end-of-line that follows it.
+            A missing end-of-line is reported through source.error,
+            and a lone \\r through source.warning.
+        '''
+
+        fdata = source.fdata
+        startstream = source.tokstart + len(tok)
+        # One-character slices rather than indexing, so that a file
+        # that ends right after the keyword is reported through the
+        # checks below instead of raising IndexError.
+        gotcr = fdata[startstream:startstream + 1] == '\r'
+        startstream += gotcr
+        gotlf = fdata[startstream:startstream + 1] == '\n'
+        startstream += gotlf
+        if not gotlf:
+            if not gotcr:
+                source.error(r'stream keyword not followed by \n')
+            else:
+                source.warning(r"stream keyword terminated "
+                               r"by \r without \n")
+        return startstream
+
+    def readstream(self, obj, startstream, source, exact_required=False,
+                   streamending='endstream endobj'.split(), int=int):
+        ''' Attach the stream data that starts at offset startstream
+            of source.fdata to obj.
+
+            The Length entry of obj is trusted first.  If the two
+            tokens after the data it describes are not
+            'endstream endobj', the method searches for the
+            endstream keyword itself and tries to work out what is
+            wrong, reporting what it finds through source.warning or
+            source.error.  With exact_required set, that mismatch is
+            reported through source.exception instead.
+        '''
+        fdata = source.fdata
+        length = int(obj.Length)
+        # Position the tokenizer where the data should end and look at
+        # the next two tokens (presumably what multiple(2) returns --
+        # confirm against PdfTokens).
+        source.floc = target_endstream = startstream + length
+        endit = source.multiple(2)
+        obj._stream = fdata[startstream:target_endstream]
+        if endit == streamending:
+            return
+
+        if exact_required:
+            source.exception('Expected endstream endobj')
+
+        # The length attribute does not match the distance between the
+        # stream and endstream keywords.
+
+        # TODO:  Extract maxstream from dictionary of object offsets
+        # and use rfind instead of find.
+        maxstream = len(fdata) - 20
+        endstream = fdata.find('endstream', startstream, maxstream)
+        source.floc = startstream
+        # Number of bytes actually available before the keyword
+        # (meaningless if the keyword was not found; checked next).
+        room = endstream - startstream
+        if endstream < 0:
+            source.error('Could not find endstream')
+            return
+        # Length is exactly one more than the room found, and the data
+        # is preceded by \r\n: take the \n to be the first byte of the
+        # data, i.e. shift the whole slice back by one.
+        if (length == room + 1 and
+                fdata[startstream - 2:startstream] == '\r\n'):
+            source.warning(r"stream keyword terminated by \r without \n")
+            obj._stream = fdata[startstream - 1:target_endstream - 1]
+            return
+        source.floc = endstream
+        # NOTE(review): the two adjusting branches below assign
+        # obj.stream rather than obj._stream -- presumably so that the
+        # dictionary's Length gets corrected too; confirm in PdfDict.
+        if length > room:
+            source.error('stream /Length attribute (%d) appears to '
+                         'be too big (size %d) -- adjusting',
+                         length, room)
+            obj.stream = fdata[startstream:endstream]
+            return
+        # Anything other than whitespace between the declared end of
+        # the data and the keyword means Length was too small.
+        if fdata[target_endstream:endstream].rstrip():
+            source.error('stream /Length attribute (%d) appears to '
+                         'be too small (size %d) -- adjusting',
+                         length, room)
+            obj.stream = fdata[startstream:endstream]
+            return
+        # The data itself looks fine, so the problem must be in what
+        # follows the endstream keyword.
+        endobj = fdata.find('endobj', endstream, maxstream)
+        if endobj < 0:
+            source.error('Could not find endobj after endstream')
+            return
+        if fdata[endstream:endobj].rstrip() != 'endstream':
+            source.error('Unexpected data between endstream and endobj')
+            return
+        source.error('Illegal endstream/endobj combination')
+
+    def loadindirect(self, key, PdfDict=PdfDict,
+                     isinstance=isinstance):
+        ''' Load the indirect object for key, an
+            (object number, generation number) pair, and return it.
+
+            If the registry entry for key is no longer a PdfIndirect
+            placeholder, that entry is returned as is.  Otherwise the
+            object is parsed from the source at its recorded offset
+            and stored in self.indirect_objects.  Returns None (after
+            a warning) if the object cannot be located.
+        '''
+        result = self.indirect_objects.get(key)
+        if not isinstance(result, PdfIndirect):
+            return result
+        source = self.source
+        # A missing key yields offset 0, which is treated as not found.
+        offset = int(self.source.obj_offsets.get(key, '0'))
+        if not offset:
+            source.warning("Did not find PDF object %s", key)
+            return None
+
+        # Read the object header and validate it
+        objnum, gennum = key
+        source.floc = offset
+        objid = source.multiple(3)
+        ok = len(objid) == 3
+        ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
+        ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
+        ok = ok and objid[2] == 'obj'
+        if not ok:
+            # The header is not at the recorded offset.  Search the
+            # whole file for it at the start of a line instead.
+            source.floc = offset
+            source.next()
+            objheader = '%d %d obj' % (objnum, gennum)
+            fdata = source.fdata
+            # find() gives -1 when there is no match, so the "+ 1"
+            # turns a miss into 0 (false), and a hit into the offset
+            # of the header itself (just past the line ending).
+            offset2 = (fdata.find('\n' + objheader) + 1 or
+                       fdata.find('\r' + objheader) + 1)
+            # Give up if there is no match, or if the same header
+            # shows up a second time further on (ambiguous).
+            if (not offset2 or
+                    fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
+                source.warning("Expected indirect object '%s'", objheader)
+                return None
+            source.warning("Indirect object %s found at incorrect "
+                           "offset %d (expected offset %d)",
+                           objheader, offset2, offset)
+            source.floc = offset2 + len(objheader)
+
+        # Read the object, and call special code if it starts
+        # an array or dictionary
+        obj = source.next()
+        func = self.special.get(obj)
+        if func is not None:
+            obj = func(source)
+
+        # Replace the placeholder in the registry with the real
+        # object, and take the key off the deferred list.
+        self.indirect_objects[key] = obj
+        self.deferred_objects.remove(key)
+
+        # Mark the object as indirect, and
+        # just return it if it is a simple object.
+        obj.indirect = key
+        tok = source.next()
+        if tok == 'endobj':
+            return obj
+
+        # Should be a stream.  Either that or it's broken.
+        isdict = isinstance(obj, PdfDict)
+        if isdict and tok == 'stream':
+            self.readstream(obj, self.findstream(obj, tok, source), source)
+            return obj
+
+        # Houston, we have a problem, but let's see if it
+        # is easily fixable.  Leaving out a space before endobj
+        # is apparently an easy mistake to make on generation
+        # (Because it won't be noticed unless you are specifically
+        # generating an indirect object that doesn't end with any
+        # sort of delimiter.)  It is so common that things like
+        # okular just handle it.
+
+        if isinstance(obj, PdfObject) and obj.endswith('endobj'):
+            source.error('No space or delimiter before endobj')
+            # Strip the 6 characters of the fused 'endobj'.
+            obj = PdfObject(obj[:-6])
+        else:
+            source.error("Expected 'endobj'%s token",
+                         isdict and " or 'stream'" or '')
+            # Unrecoverable: substitute an empty object.
+            obj = PdfObject('')
+
+        # obj was replaced above, so mark and register it again.
+        obj.indirect = key
+        self.indirect_objects[key] = obj
+        return obj
+
+    def read_all(self):
+        ''' Load every deferred indirect object.
+
+            The deferred set changes while objects are being
+            loaded, so keep sweeping until a pass turns up no
+            key that has not already been attempted.
+        '''
+        pending = self.deferred_objects
+        attempted = set()
+        fresh = pending - attempted
+        while fresh:
+            # Record the attempt before loading: loadindirect()
+            # mutates the deferred set underneath us.
+            attempted |= pending
+            for key in fresh:
+                self.loadindirect(key)
+            fresh = pending - attempted
+
+    def decrypt_all(self):
+        ''' Read every indirect object, then decrypt all of them
+            if crypt filters have been set up.
+        '''
+        self.read_all()
+
+        filters = self.crypt_filters
+        if filters is None:
+            return
+        crypt.decrypt_objects(self.indirect_objects.values(),
+                              self.stream_crypt_filter, filters)
+
+    def uncompress(self):
+        ''' Read every indirect object and hand all of them to
+            the module-level uncompress() function.
+        '''
+        self.read_all()
+        loaded = self.indirect_objects.values()
+        uncompress(loaded)
+
+    def load_stream_objects(self, object_streams):
+        ''' Load the objects stored inside object streams.
+
+            object_streams is iterated for the object numbers of
+            the /ObjStm streams themselves (each is looked up at
+            generation 0).  Every object found inside them is stored
+            in self.indirect_objects under (number, 0) and dropped
+            from self.deferred_objects.
+        '''
+        # read object streams
+        objs = []
+        for num in object_streams:
+            obj = self.findindirect(num, 0).real_value()
+            assert obj.Type == '/ObjStm'
+            objs.append(obj)
+
+        # read objects from stream
+        if objs:
+            # Decrypt
+            if self.crypt_filters is not None:
+                crypt.decrypt_objects(
+                    objs, self.stream_crypt_filter, self.crypt_filters)
+
+            # Decompress
+            uncompress(objs)
+
+            for obj in objs:
+                objsource = PdfTokens(obj.stream, 0, False)
+                next = objsource.next
+                offsets = []
+                firstoffset = int(obj.First)
+                # Everything before /First is a header of integer pairs:
+                # object number, then offset relative to /First.
+                while objsource.floc < firstoffset:
+                    offsets.append((int(next()), firstoffset + int(next())))
+                for num, offset in offsets:
+                    # Read the object, and call special code if it starts
+                    # an array or dictionary
+                    objsource.floc = offset
+                    sobj = next()
+                    func = self.special.get(sobj)
+                    if func is not None:
+                        sobj = func(objsource)
+
+                    key = (num, 0)
+                    self.indirect_objects[key] = sobj
+                    if key in self.deferred_objects:
+                        self.deferred_objects.remove(key)
+
+                    # Mark the object as indirect.
+                    sobj.indirect = key
+
+    def findxref(self, fdata):
+        ''' Locate the last "startxref" keyword in the file data.
+
+            Returns (startloc, source): the position of that keyword
+            and a tokenizer positioned at the cross-reference offset
+            which follows it.
+        '''
+        startloc = fdata.rfind('startxref')
+        if startloc < 0:
+            raise PdfParseError('Did not find "startxref" at end of file')
+
+        verbose = self.verbose
+        tokens = PdfTokens(fdata, startloc, False, verbose)
+        keyword = tokens.next()
+        assert keyword == 'startxref'  # (We just checked this...)
+
+        tableloc = tokens.next_default()
+        if not tableloc.isdigit():
+            tokens.exception('Expected table location')
+
+        # The token after the offset must be the %%EOF comment.
+        eof_marker = tokens.next_default().rstrip().lstrip('%')
+        if eof_marker != 'EOF':
+            tokens.exception('Expected %%EOF')
+        return startloc, PdfTokens(fdata, int(tableloc), True, verbose)
+
+    def parse_xref_stream(self, source, int=int, range=range,
+                          enumerate=enumerate, islice=itertools.islice,
+                          defaultdict=collections.defaultdict,
+                          hexlify=binascii.hexlify):
+        ''' Parse one cross-reference section stored as an xref stream.
+
+            source must be positioned just after the object number
+            that starts the xref stream object.  Offsets of entries
+            with type 1 are recorded in source.obj_offsets (existing
+            keys are left alone); entries with type 2 are grouped by
+            their first field into obj.private.object_streams.
+
+            Returns the xref stream dictionary.  Problems are reported
+            through source.exception().
+        '''
+
+        def readint(s, lengths):
+            # Endlessly yield consecutive fields of s, cycling through the
+            # widths in lengths.  Each field is decoded as one integer by
+            # way of its hex representation; a zero-width field gives None.
+            offset = 0
+            for length in itertools.cycle(lengths):
+                next = offset + length
+                yield int(hexlify(s[offset:next]), 16) if length else None
+                offset = next
+
+        setdefault = source.obj_offsets.setdefault
+        next = source.next
+        # check for xref stream object
+        objid = source.multiple(3)
+        ok = len(objid) == 3
+        ok = ok and objid[0].isdigit()
+        ok = ok and objid[1] == 'obj'
+        ok = ok and objid[2] == '<<'
+        if not ok:
+            source.exception('Expected xref stream start')
+        obj = self.readdict(source)
+        if obj.Type != PdfName.XRef:
+            source.exception('Expected dict type of /XRef')
+        tok = next()
+        self.readstream(obj, self.findstream(obj, tok, source), source, True)
+        old_strm = obj.stream
+        if not uncompress([obj], True):
+            source.exception('Could not decompress Xref stream')
+        stream = obj.stream
+        # Fix for issue #76 -- goofy compressed xref stream
+        # that is NOT ACTUALLY COMPRESSED
+        stream = stream if stream is not old_strm else convert_store(old_strm)
+        # /Index holds (first object number, count) pairs; without it
+        # a single run starting at 0 and covering /Size entries is used.
+        num_pairs = obj.Index or PdfArray(['0', obj.Size])
+        num_pairs = [int(x) for x in num_pairs]
+        num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
+        entry_sizes = [int(x) for x in obj.W]
+        if len(entry_sizes) != 3:
+            source.exception('Invalid entry size')
+        object_streams = defaultdict(list)
+        get = readint(stream, entry_sizes)
+        for objnum, size in num_pairs:
+            for cnt in range(size):
+                # Pull the three fields of the next entry.
+                xtype, p1, p2 = islice(get, 3)
+                if xtype in (1, None):
+                    # p1 is stored as the object's offset and p2 as its
+                    # generation; a zero offset is ignored.
+                    if p1:
+                        setdefault((objnum, p2 or 0), p1)
+                elif xtype == 2:
+                    # Grouped under p1 -- the caller iterates these keys
+                    # as object stream numbers.
+                    object_streams[p1].append((objnum, p2))
+                objnum += 1
+
+        obj.private.object_streams = object_streams
+        return obj
+
+    def parse_xref_table(self, source, int=int, range=range):
+        ''' Parse one classic (non-stream) cross-reference table.
+
+            source must be positioned just after the "xref" keyword.
+            Offsets of in-use entries are recorded in source.obj_offsets
+            under (object number, generation); keys already present
+            are left alone and zero offsets are ignored.
+
+            A strict token-by-token parse is tried first.  If that
+            fails, the text up to the last "trailer" keyword is
+            re-read line by line as a best-effort recovery.  On return
+            the "trailer" keyword has been consumed.  If recovery fails
+            as well, source.floc is reset to the start of the table and
+            source.exception('Invalid table format') is called.
+        '''
+        setdefault = source.obj_offsets.setdefault
+        next = source.next
+        # plain xref table
+        start = source.floc
+        try:
+            while 1:
+                tok = next()
+                if tok == 'trailer':
+                    return
+                startobj = int(tok)
+                for objnum in range(startobj, startobj + int(next())):
+                    offset = int(next())
+                    generation = int(next())
+                    inuse = next()
+                    if inuse == 'n':
+                        if offset != 0:
+                            setdefault((objnum, generation), offset)
+                    elif inuse != 'f':
+                        raise ValueError
+        except Exception:
+            # Any parse failure (bad number, unknown flag, running out
+            # of tokens) just means the table is not well formed; fall
+            # through to the line-based recovery below.  Deliberately
+            # not a bare except, so KeyboardInterrupt and SystemExit
+            # still propagate.
+            pass
+        try:
+            # Table formatted incorrectly.
+            # See if we can figure it out anyway.
+            end = source.fdata.rindex('trailer', start)
+            table = source.fdata[start:end].splitlines()
+            objnum = None
+            for line in table:
+                tokens = line.split()
+                if len(tokens) == 2:
+                    # Subsection header: first object number, count.
+                    objnum = int(tokens[0])
+                elif len(tokens) == 3:
+                    if objnum is None:
+                        # Entry before any subsection header.
+                        raise ValueError
+                    offset, generation, inuse = (int(tokens[0]),
+                                                 int(tokens[1]), tokens[2])
+                    if offset != 0 and inuse == 'n':
+                        setdefault((objnum, generation), offset)
+                    objnum += 1
+                elif tokens:
+                    log.error('Invalid line in xref table: %s' %
+                              repr(line))
+                    raise ValueError
+            log.warning('Badly formatted xref table')
+            source.floc = end
+            next()
+        except Exception:
+            source.floc = start
+            source.exception('Invalid table format')
+
+    def parsexref(self, source):
+        ''' Parse one cross-reference section, whichever form it has.
+
+            Returns (dict, is_stream): the xref stream dictionary and
+            True, or the dictionary read after a classic table and
+            False.  Anything else is reported via source.exception().
+        '''
+        try:
+            first = source.next()
+        except StopIteration:
+            first = ''
+
+        # An object number means the section is an xref stream.
+        if first.isdigit():
+            return self.parse_xref_stream(source), True
+
+        if first != 'xref':
+            source.exception('Expected "xref" keyword or xref stream object')
+            return
+
+        self.parse_xref_table(source)
+        opener = source.next()
+        if opener != '<<':
+            source.exception('Expected "<<" starting catalog')
+        return self.readdict(source), False
+
+    def readpages(self, node):
+        ''' Flatten the page tree below node into a list of pages.
+
+            The tree is walked with an explicit stack; /Kids are
+            pushed reversed so pages come out in document order.
+            A /Catalog node is followed through its /Pages entry.
+            Returns [] (after logging) if the tree is malformed.
+        '''
+        type_key = PdfName.Type
+        kids_key = PdfName.Kids
+        page_type = PdfName.Page
+        pages_type = PdfName.Pages
+        catalog_type = PdfName.Catalog
+
+        pages = []
+        pending = [node]
+        try:
+            while pending:
+                current = pending.pop()
+                kind = current[type_key]
+                if kind == page_type:
+                    pages.append(current)
+                elif kind == pages_type:
+                    pending.extend(reversed(current[kids_key]))
+                elif kind == catalog_type:
+                    pending.append(current[pages_type])
+                else:
+                    log.error('Expected /Page or /Pages dictionary, got %s' %
+                              repr(current))
+        except (AttributeError, TypeError) as s:
+            log.error('Invalid page tree: %s' % s)
+            return []
+        return pages
+
+    def _parse_encrypt_info(self, source, password, trailer):
+        """Validate the password and set up the crypt filters.
+
+        A key is derived from password and trailer; a failed user
+        password check only produces a warning.  For /V 1 or 2 one
+        RC4 filter serves both streams and strings.  For /V 4 the
+        filters named in /CF are created and /StmF and /StrF select
+        the defaults.  Any other version is reported as unsupported.
+        """
+        key = crypt.create_key(password, trailer)
+        if not crypt.check_user_password(key, trailer):
+            source.warning('User password does not validate')
+
+        private = self.private
+        crypt_filters = self.crypt_filters
+        version = int(trailer.Encrypt.V or 0)
+
+        if version in (1, 2):
+            rc4_filter = crypt.RC4CryptFilter(key)
+            private.stream_crypt_filter = rc4_filter
+            private.string_crypt_filter = rc4_filter
+            return
+
+        if version != 4:
+            source.warning(
+                'Unsupported Encrypt version: {}'.format(version))
+            return
+
+        # Version 4: build every named filter except /Identity.
+        if PdfName.CF in trailer.Encrypt:
+            for name, params in iteritems(trailer.Encrypt.CF):
+                if name == PdfName.Identity:
+                    continue
+
+                cfm = params.CFM
+                if cfm == PdfName.AESV2:
+                    new_filter = crypt.AESCryptFilter(key)
+                elif cfm == PdfName.V2:
+                    new_filter = crypt.RC4CryptFilter(key)
+                else:
+                    source.warning(
+                        'Unsupported crypt filter: {}, {}'.format(
+                            name, cfm))
+                    continue
+                crypt_filters[name] = new_filter
+
+        # Default filter for streams
+        if PdfName.StmF in trailer.Encrypt:
+            stream_name = trailer.Encrypt.StmF
+            if stream_name in crypt_filters:
+                private.stream_crypt_filter = crypt_filters[stream_name]
+            else:
+                source.warning(
+                    'Invalid crypt filter name in /StmF: {}'.format(
+                        stream_name))
+
+        # Default filter for strings
+        if PdfName.StrF in trailer.Encrypt:
+            string_name = trailer.Encrypt.StrF
+            if string_name in crypt_filters:
+                private.string_crypt_filter = crypt_filters[string_name]
+            else:
+                source.warning(
+                    'Invalid crypt filter name in /StrF: {}'.format(
+                        string_name))
+
+    def __init__(self, fname=None, fdata=None, decompress=False,
+                 decrypt=False, password='', disable_gc=True, verbose=True):
+        ''' Read and parse a PDF file.
+
+            Parameters:
+                fname -- path of the file to read, or an object with a
+                         read() method; mutually exclusive with fdata
+                fdata -- the file contents, already read into memory
+                decompress -- True to uncompress the objects once the
+                              page tree has been read
+                decrypt -- True to decrypt the document when the
+                           trailer contains /Encrypt
+                password -- password used to derive the decryption key
+                disable_gc -- True to switch the garbage collector off
+                              while parsing; it is switched back on
+                              afterwards only if it was running on entry
+                verbose -- stored as self.private.verbose
+
+            Raises PdfParseError if the file cannot be read, has no
+            '%PDF-' header or no '%EOF' marker, or if decryption is
+            requested for an encrypted file while crypt.HAS_CRYPTO
+            is false.
+        '''
+        self.private.verbose = verbose
+
+        # Runs a lot faster with GC off.
+        disable_gc = disable_gc and gc.isenabled()
+        if disable_gc:
+            gc.disable()
+
+        try:
+            if fname is not None:
+                assert fdata is None
+                # Allow reading preexisting streams like pyPdf
+                if hasattr(fname, 'read'):
+                    fdata = fname.read()
+                else:
+                    try:
+                        # 'with' closes the handle even if read() fails.
+                        with open(fname, 'rb') as f:
+                            fdata = f.read()
+                    except IOError:
+                        raise PdfParseError('Could not read PDF file %s' %
+                                            fname)
+
+            assert fdata is not None
+            fdata = convert_load(fdata)
+
+            header_loc = 0
+            if not fdata.startswith('%PDF-'):
+                header_loc = fdata.find('%PDF-')
+                if header_loc >= 0:
+                    log.warning('PDF header not at beginning of file')
+                else:
+                    lines = fdata.lstrip().splitlines()
+                    if not lines:
+                        raise PdfParseError('Empty PDF file!')
+                    raise PdfParseError('Invalid PDF header: %s' %
+                                        repr(lines[0]))
+
+            # The three version characters follow '%PDF-' wherever the
+            # header was actually found, not necessarily at offset 5.
+            self.private.version = fdata[header_loc + 5:header_loc + 8]
+
+            endloc = fdata.rfind('%EOF')
+            if endloc < 0:
+                raise PdfParseError('EOF mark not found: %s' %
+                                    repr(fdata[-20:]))
+            # Keep the marker plus a couple of trailing characters;
+            # anything beyond that is treated as junk.
+            endloc += 6
+            junk = fdata[endloc:]
+            fdata = fdata[:endloc]
+            if junk.rstrip('\00').strip():
+                log.warning('Extra data at end of file')
+
+            private = self.private
+            private.indirect_objects = {}
+            private.deferred_objects = set()
+            # Tokens that need a dedicated reader (or are never valid
+            # as the first token of an object).
+            private.special = {'<<': self.readdict,
+                               '[': self.readarray,
+                               'endobj': self.empty_obj,
+                               }
+            for tok in r'\ ( ) < > { } ] >> %'.split():
+                self.special[tok] = self.badtoken
+
+            startloc, source = self.findxref(fdata)
+            private.source = source
+
+            # Find all the xref tables/streams, and
+            # then deal with them backwards.
+            xref_list = []
+            while 1:
+                source.obj_offsets = {}
+                trailer, is_stream = self.parsexref(source)
+                prev = trailer.Prev
+                if prev is None:
+                    token = source.next()
+                    if token != 'startxref' and not xref_list:
+                        source.warning('Expected "startxref" '
+                                       'at end of xref table')
+                    break
+                xref_list.append((source.obj_offsets, trailer, is_stream))
+                source.floc = int(prev)
+
+            # Handle document encryption
+            private.crypt_filters = None
+            if decrypt and PdfName.Encrypt in trailer:
+                identity_filter = crypt.IdentityCryptFilter()
+                crypt_filters = {
+                    PdfName.Identity: identity_filter
+                }
+                private.crypt_filters = crypt_filters
+                private.stream_crypt_filter = identity_filter
+                private.string_crypt_filter = identity_filter
+
+                if not crypt.HAS_CRYPTO:
+                    raise PdfParseError(
+                        'Install PyCrypto to enable encryption support')
+
+                self._parse_encrypt_info(source, password, trailer)
+
+            if is_stream:
+                self.load_stream_objects(trailer.object_streams)
+
+            # Apply the newer sections on top of the oldest one.
+            while xref_list:
+                later_offsets, later_trailer, is_stream = xref_list.pop()
+                source.obj_offsets.update(later_offsets)
+                if is_stream:
+                    trailer.update(later_trailer)
+                    self.load_stream_objects(later_trailer.object_streams)
+                else:
+                    trailer = later_trailer
+
+            trailer.Prev = None
+
+            if (trailer.Version and
+                    float(trailer.Version) > float(self.version)):
+                self.private.version = trailer.Version
+
+            if decrypt:
+                self.decrypt_all()
+                trailer.Encrypt = None
+
+            if is_stream:
+                # An xref stream dictionary carries stream-specific
+                # keys too, so copy only the trailer entries.
+                self.Root = trailer.Root
+                self.Info = trailer.Info
+                self.ID = trailer.ID
+                self.Size = trailer.Size
+                self.Encrypt = trailer.Encrypt
+            else:
+                self.update(trailer)
+
+            # self.read_all_indirect(source)
+            private.pages = self.readpages(self.Root)
+            if decompress:
+                self.uncompress()
+
+            # For compatibility with pyPdf
+            private.numPages = len(self.pages)
+        finally:
+            if disable_gc:
+                gc.enable()
+
+    # For compatibility with pyPdf
+    def getPage(self, pagenum):
+        ''' Return the entry of self.pages at index pagenum.
+        '''
+        pages = self.pages
+        return pages[pagenum]
--- a/pdfrw/pdfwriter.py
+++ b/pdfrw/pdfwriter.py
@ -0,0 +1,385 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+The PdfWriter class writes an entire PDF file out to disk.
+
+The writing process is not at all optimized or organized.
+
+An instance of the PdfWriter class has two methods:
+    addpage(page)
+and
+    write(fname)
+
+addpage() assumes that the pages are part of a valid
+tree/forest of PDF objects.
+'''
+import gc
+
+from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict,
+                      PdfObject, PdfString)
+from .compress import compress as do_compress
+from .errors import PdfOutputError, log
+from .py23_diffs import iteritems, convert_store
+
+# Stand-in that FormatObjects substitutes when the replacement for a
+# killed object turns out to be None (see its swapobj table).
+NullObject = PdfObject('null')
+# A truthy 'indirect' attribute makes FormatObjects' add() treat it as
+# an indirect object.
+NullObject.indirect = True
+NullObject.Type = 'Null object'
+
+
+def user_fmt(obj, isinstance=isinstance, float=float, str=str,
+             basestring=(type(u''), type(b'')), encode=PdfString.encode):
+    ''' Default formatter for objects that carry no 'indirect'
+        attribute.  May be replaced by the user for specialized
+        formatting requirements.
+
+        Text and byte strings go through encode(); floats are
+        written in fixed-point form with trailing zeros (and a
+        dangling decimal point) stripped; anything else is str()'d.
+    '''
+    if isinstance(obj, basestring):
+        return encode(obj)
+
+    if not isinstance(obj, float):
+        return str(obj)
+
+    # PDFs don't handle exponent notation
+    fixed = '%.9f' % obj
+    return fixed.rstrip('0').rstrip('.')
+
+
+def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
+                  user_fmt=user_fmt, do_compress=do_compress,
+                  convert_store=convert_store, iteritems=iteritems,
+                  id=id, isinstance=isinstance, getattr=getattr, len=len,
+                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
+                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
+                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
+    ''' FormatObjects performs the actual formatting and disk write.
+        Should be a class, was a class, turned into nested functions
+        for performance (to reduce attribute lookups).
+
+        f is written to through f.write(); trailer is the trailer
+        dictionary whose object graph is emitted (its Size entry is
+        overwritten here); version goes into the '%PDF-' header;
+        compress enables do_compress() on dictionaries with streams;
+        killobj maps id(obj) -> (obj, new_obj) for objects whose
+        references must be swapped for replacements; user_fmt formats
+        objects that have no 'indirect' attribute.  The remaining
+        keyword arguments only bind globals as locals.
+    '''
+
+    def f_write(s):
+        # All output goes through convert_store() before f.write().
+        f.write(convert_store(s))
+
+    def add(obj):
+        ''' Add an object to our list, if it's an indirect
+            object.  Just format it if not.
+        '''
+        # Can't hash dicts, so just hash the object ID
+        objid = id(obj)
+
+        # Automatically set stream objects to indirect
+        if isinstance(obj, PdfDict):
+            indirect = obj.indirect or (obj.stream is not None)
+        else:
+            indirect = getattr(obj, 'indirect', False)
+
+        if not indirect:
+            # 'visited' holds the ids of direct objects that are being
+            # formatted right now (added before format_obj, removed
+            # after), so a hit means this object was reached again
+            # from inside itself; format a copy instead.
+            if objid in visited:
+                log.warning('Replicating direct %s object, '
+                            'should be indirect for optimal file size' %
+                            type(obj))
+                obj = type(obj)(obj)
+                objid = id(obj)
+            visiting(objid)
+            result = format_obj(obj)
+            leaving(objid)
+            return result
+
+        objnum = indirect_dict_get(objid)
+
+        # If we haven't seen the object yet, we need to
+        # add it to the indirect object list.
+        if objnum is None:
+            # Killed objects are replaced by their substitute; if the
+            # substitute is already numbered, reuse that number.
+            swapped = swapobj(objid)
+            if swapped is not None:
+                old_id = objid
+                obj = swapped
+                objid = id(obj)
+                objnum = indirect_dict_get(objid)
+                if objnum is not None:
+                    indirect_dict[old_id] = objnum
+                    return '%s 0 R' % objnum
+            # Reserve a slot now; the text is filled in by
+            # format_deferred().
+            objnum = len(objlist) + 1
+            objlist_append(None)
+            indirect_dict[objid] = objnum
+            deferred.append((objnum - 1, obj))
+        return '%s 0 R' % objnum
+
+    def format_array(myarray, formatter):
+        # Format array data into semi-readable ASCII
+        # (one line when the items total at most 70 characters).
+        if sum([len(x) for x in myarray]) <= 70:
+            return formatter % space_join(myarray)
+        return format_big(myarray, formatter)
+
+    def format_big(myarray, formatter):
+        # Wrap the items over several lines, starting a new line
+        # whenever the running length would pass 71 characters.
+        bigarray = []
+        # Huge initial count forces a first line to be started.
+        count = 1000000
+        for x in myarray:
+            lenx = len(x) + 1
+            count += lenx
+            if count > 71:
+                subarray = []
+                bigarray.append(subarray)
+                count = lenx
+            subarray.append(x)
+        return formatter % lf_join([space_join(x) for x in bigarray])
+
+    def format_obj(obj):
+        ''' format PDF object data into semi-readable ASCII.
+            May mutually recurse with add() -- add() will
+            return references for indirect objects, and add
+            the indirect object to the list.
+        '''
+        while 1:
+            if isinstance(obj, (list, dict, tuple)):
+                if isinstance(obj, PdfArray):
+                    myarray = [add(x) for x in obj]
+                    return format_array(myarray, '[%s]')
+                elif isinstance(obj, PdfDict):
+                    if compress and obj.stream:
+                        do_compress([obj])
+                    # Sorted by key, preferring a key's 'encoded' form.
+                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
+                                   for (x, y) in obj.iteritems())
+                    myarray = []
+                    for key, value in pairs:
+                        myarray.append(key)
+                        myarray.append(add(value))
+                    result = format_array(myarray, '<<%s>>')
+                    stream = obj.stream
+                    if stream is not None:
+                        result = ('%s\nstream\n%s\nendstream' %
+                                  (result, stream))
+                    return result
+                # Plain list/tuple/dict: convert to the Pdf type and
+                # go round the loop again.
+                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
+                continue
+
+            # We assume that an object with an indirect
+            # attribute knows how to represent itself to us.
+            if hasattr(obj, 'indirect'):
+                return str(getattr(obj, 'encoded', None) or obj)
+            return user_fmt(obj)
+
+    def format_deferred():
+        # Formatting one object may queue more, so run until empty.
+        while deferred:
+            index, obj = deferred.pop()
+            objlist[index] = format_obj(obj)
+
+    # id(obj) -> object number of every indirect object seen so far
+    indirect_dict = {}
+    indirect_dict_get = indirect_dict.get
+    # Formatted text of each indirect object; index is number - 1
+    objlist = []
+    objlist_append = objlist.append
+    visited = set()
+    visiting = visited.add
+    leaving = visited.remove
+    space_join = ' '.join
+    lf_join = '\n  '.join
+
+    # (slot index, object) pairs still waiting to be formatted
+    deferred = []
+
+    # Don't reference old catalog or pages objects --
+    # swap references to new ones.
+    type_remap = {PdfName.Catalog: trailer.Root,
+               PdfName.Pages: trailer.Root.Pages, None: trailer}.get
+    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
+               for objid, (obj, new_obj) in iteritems(killobj)]
+    # Anything left without a replacement is swapped for NullObject.
+    swapobj = dict((objid, obj is None and NullObject or obj)
+                   for objid, obj in swapobj).get
+
+    for objid in killobj:
+        assert swapobj(objid) is not None
+
+    # The first format of trailer gets all the information,
+    # but we throw away the actual trailer formatting.
+    format_obj(trailer)
+    # Keep formatting until we're done.
+    # (Used to recurse inside format_obj for this, but
+    #  hit system limit.)
+    format_deferred()
+    # Now we know the size, so we update the trailer dict
+    # and get the formatted data.
+    trailer.Size = PdfObject(len(objlist) + 1)
+    trailer = format_obj(trailer)
+
+    # Now we have all the pieces to write out to the file.
+    # Keep careful track of the counts while we do it so
+    # we can correctly build the cross-reference.
+
+    # The second header line is a comment made of four high-bit
+    # characters.
+    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
+    f_write(header)
+    # NOTE(review): offsets are counted in characters; this presumably
+    # relies on convert_store() producing one byte per character.
+    offset = len(header)
+    # Entry for object 0: offset 0, generation 65535, flagged free.
+    offsets = [(0, 65535, 'f')]
+    offsets_append = offsets.append
+
+    for i, x in enumerate(objlist):
+        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
+        offsets_append((offset, 0, 'n'))
+        offset += len(objstr)
+        f_write(objstr)
+
+    # After the loop, offset is where the xref table itself starts.
+    f_write('xref\n0 %s\n' % len(offsets))
+    for x in offsets:
+        f_write('%010d %05d %s\r\n' % x)
+    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
+
+
+class PdfWriter(object):
+    # Collects pages with addpage()/addpages() and writes them out as
+    # a complete PDF file with write().
+    #
+    # This description is a comment rather than a class docstring on
+    # purpose: 'replaceable' at the end of the class body snapshots
+    # the class namespace, and a docstring would add '__doc__' to the
+    # set of names that may be overridden through **kwargs.
+
+    # Cached trailer; reset whenever a page is added.
+    _trailer = None
+    # When true, make_canonical() runs before the trailer is built.
+    canonicalize = False
+    fname = None
+
+    def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
+        """
+            Parameters:
+                fname -- Output file name, or file-like binary object
+                         with a write method
+                version -- PDF version to target.  Currently only 1.3
+                           supported.
+                compress -- True to do compression on output.  Currently
+                            compresses stream objects.
+                kwargs -- Overrides for class-level attributes; every
+                          name must be in PdfWriter.replaceable,
+                          otherwise ValueError is raised.
+        """
+
+        # Legacy support:  fname is new, was added in front
+        # (so a first argument that converts to float is really the
+        # version, and the second one, if given, is compress).
+        if fname is not None:
+            try:
+                float(fname)
+            except (ValueError, TypeError):
+                pass
+            else:
+                if version != '1.3':
+                    assert compress == False
+                    compress = version
+                version = fname
+                fname = None
+
+        self.fname = fname
+        self.version = version
+        self.compress = compress
+
+        if kwargs:
+            for name, value in iteritems(kwargs):
+                if name not in self.replaceable:
+                    raise ValueError("Cannot set attribute %s "
+                                     "on PdfWriter instance" % name)
+                setattr(self, name, value)
+
+        self.pagearray = PdfArray()
+        # id(obj) -> (obj, new_obj) for objects that must not be
+        # written as they are; consumed by FormatObjects.
+        self.killobj = {}
+
+    def addpage(self, page):
+        """ Append a copy of page to the output and return self.
+
+            Raises PdfOutputError if page.Type is not /Page.
+        """
+        self._trailer = None
+        if page.Type != PdfName.Page:
+            raise PdfOutputError('Bad /Type:  Expected %s, found %s'
+                                 % (PdfName.Page, page.Type))
+        inheritable = page.inheritable  # searches for resources
+        # The copy carries the inheritable attributes itself, since it
+        # will be re-parented under a new /Pages node.
+        self.pagearray.append(
+            IndirectPdfDict(
+                page,
+                Resources=inheritable.Resources,
+                MediaBox=inheritable.MediaBox,
+                CropBox=inheritable.CropBox,
+                Rotate=inheritable.Rotate,
+            )
+        )
+
+        # Add parents in the hierarchy to objects we
+        # don't want to output
+        killobj = self.killobj
+        obj, new_obj = page, self.pagearray[-1]
+        while obj is not None:
+            objid = id(obj)
+            if objid in killobj:
+                break
+            killobj[objid] = obj, new_obj
+            obj = obj.Parent
+            # Only the page itself has a direct replacement.
+            new_obj = None
+        return self
+
+    addPage = addpage  # for compatibility with pyPdf
+
+    def addpages(self, pagelist):
+        """ Call addpage() for every page in pagelist; return self. """
+        for page in pagelist:
+            self.addpage(page)
+        return self
+
+    def _get_trailer(self):
+        # Return the cached trailer, building it from the collected
+        # pages first if necessary.
+        trailer = self._trailer
+        if trailer is not None:
+            return trailer
+
+        if self.canonicalize:
+            self.make_canonical()
+
+        # Create the basic object structure of the PDF file
+        trailer = PdfDict(
+            Root=IndirectPdfDict(
+                Type=PdfName.Catalog,
+                Pages=IndirectPdfDict(
+                    Type=PdfName.Pages,
+                    Count=PdfObject(len(self.pagearray)),
+                    Kids=self.pagearray
+                )
+            )
+        )
+        # Make all the pages point back to the page dictionary and
+        # ensure they are indirect references
+        pagedict = trailer.Root.Pages
+        for page in pagedict.Kids:
+            page.Parent = pagedict
+            page.indirect = True
+        self._trailer = trailer
+        return trailer
+
+    def _set_trailer(self, trailer):
+        self._trailer = trailer
+
+    trailer = property(_get_trailer, _set_trailer)
+
+    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
+              disable_gc=True):
+        """ Format the document and write it out.
+
+            Parameters:
+                fname -- output file name or object with a write
+                         method; must be given exactly once, either
+                         here or to the constructor
+                trailer -- trailer dictionary to write; defaults to
+                           self.trailer
+                user_fmt -- formatter passed through to FormatObjects
+                disable_gc -- True to switch the garbage collector
+                              off while writing; it is switched back
+                              on only if it was running on entry
+
+            Raises PdfOutputError if fname is given both here and to
+            the constructor, or to neither.
+        """
+
+        trailer = trailer or self.trailer
+
+        # Support fname for legacy applications
+        if (fname is not None) == (self.fname is not None):
+            raise PdfOutputError(
+                "PdfWriter fname must be specified exactly once")
+
+        # Compare against None rather than relying on truthiness, so
+        # a file-like object that happens to be falsy is still used.
+        if fname is None:
+            fname = self.fname
+
+        # Dump the data.  We either have a filename or a preexisting
+        # file object.
+        preexisting = hasattr(fname, 'write')
+        f = fname if preexisting else open(fname, 'wb')
+
+        # Leave the collector alone if it is already off, so that it
+        # is not switched on behind the caller's back afterwards
+        # (same policy as PdfReader).
+        disable_gc = disable_gc and gc.isenabled()
+        if disable_gc:
+            gc.disable()
+
+        try:
+            FormatObjects(f, trailer, self.version, self.compress,
+                          self.killobj, user_fmt=user_fmt)
+        finally:
+            if not preexisting:
+                f.close()
+            if disable_gc:
+                gc.enable()
+
+    def make_canonical(self):
+        ''' Canonicalizes a PDF.  Assumes everything
+            is a Pdf object already.
+
+            Walks everything reachable from the pages and sets
+            'indirect' to True on arrays and dictionaries and to
+            False on every other object.
+        '''
+        visited = set()
+        workitems = list(self.pagearray)
+        while workitems:
+            obj = workitems.pop()
+            objid = id(obj)
+            if objid in visited:
+                continue
+            visited.add(objid)
+            obj.indirect = False
+            if isinstance(obj, (PdfArray, PdfDict)):
+                obj.indirect = True
+                if isinstance(obj, PdfArray):
+                    workitems += obj
+                else:
+                    workitems += obj.values()
+
+    # Names defined in the class body so far; these are the only
+    # attributes __init__ allows to be overridden via **kwargs.
+    replaceable = set(vars())
--- a/pdfrw/py23_diffs.py
+++ b/pdfrw/py23_diffs.py
@ -0,0 +1,53 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+# Deal with Python2/3 differences
+
+try:
+    import zlib
+except ImportError:
+    zlib = None
+
+# The presence of the 'unicode' builtin tells the two major Python
+# versions apart: it exists on Python 2 and raises NameError on 3.
+try:
+    unicode = unicode
+except NameError:
+    # Python 3: text and bytes are distinct types, so convert
+    # explicitly.  Latin-1 maps each byte to exactly one character.
+
+    def convert_load(s):
+        # bytes -> str; anything else is returned untouched.
+        if isinstance(s, bytes):
+            return s.decode('Latin-1')
+        return s
+
+    def convert_store(s):
+        # str -> bytes
+        return s.encode('Latin-1')
+
+    def from_array(a):
+        # presumably 'a' is an array.array -- verify against callers
+        return a.tobytes()
+
+else:
+    # Python 2: the same string type serves both purposes, so the
+    # conversions are no-ops.
+
+    def convert_load(s):
+        return s
+
+    def convert_store(s):
+        return s
+
+    def from_array(a):
+        return a.tostring()
+
+# Name of the iterator-advance method on this interpreter.  The
+# single-target unpacking fails unless exactly one attribute of a
+# list iterator contains 'next'.
+nextattr, = (x for x in dir(iter([])) if 'next' in x)
+
+# Unbound dict method, called as iteritems(d); falls back to
+# dict.items where dict.iteritems does not exist.
+try:
+    iteritems = dict.iteritems
+except AttributeError:
+    iteritems = dict.items
+
+# Export xrange on both versions, using range where there is no xrange.
+try:
+    xrange = xrange
+except NameError:
+    xrange = range
+
+# Export intern on both versions, taking it from sys where it is
+# not a builtin.
+try:
+    intern = intern
+except NameError:
+    from sys import intern
--- a/pdfrw/tokens.py
+++ b/pdfrw/tokens.py
@ -0,0 +1,229 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+A tokenizer for PDF streams.
+
+In general, documentation used was "PDF reference",
+sixth edition, for PDF version 1.7, dated November 2006.
+
+'''
+
+import re
+import itertools
+from .objects import PdfString, PdfObject
+from .objects.pdfname import BasePdfName
+from .errors import log, PdfParseError
+from .py23_diffs import nextattr, intern
+
+
+def linepos(fdata, loc):
+    ''' Return the (line, column) of index loc within fdata.
+
+        Both values are 1-based.  A line break is a newline,
+        a carriage return, or a carriage return/newline pair
+        (which counts as a single break).
+    '''
+    newlines = fdata.count('\n', 0, loc)
+    returns = fdata.count('\r', 0, loc)
+    pairs = fdata.count('\r\n', 0, loc)
+    # rfind gives -1 when there is no earlier break, which makes
+    # the column of the very first character come out as 1.
+    last_break = max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
+    return 1 + newlines + returns - pairs, loc - last_break
+
+
+class PdfTokens(object):
+    ''' Iterator over the tokens found in a string of PDF data.
+
+        Constructor parameters:
+            fdata          - the string to tokenize
+            startloc       - index within fdata of the first token
+            strip_comments - if true, comments are skipped instead
+                             of being returned as tokens
+            verbose        - if false, each distinct message is
+                             reported at most once
+
+        Tokens are retrieved by iterating, or with next(),
+        next_default() or multiple().  The floc property is the
+        position where the next token will be sought; assigning
+        to it moves the tokenizer.
+    '''
+
+    # Table 3.1, page 50 of reference, defines whitespace
+    eol = '\n\r'
+    whitespace = '\x00 \t\f' + eol
+
+    # Text on page 50 defines delimiter characters
+    # Escape the ]
+    delimiters = r'()<>{}[\]/%'
+
+    # "normal" stuff is all but delimiters or whitespace.
+
+    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
+                                             whitespace)
+
+    p_comment = r'\%%[^%s]*' % eol
+
+    # This will get the bulk of literal strings.
+    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'
+
+    # This will get more pieces of literal strings
+    # (Don't ask me why, but it hangs without the trailing ?.)
+    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'
+
+    # A hex string.  This one's easy.
+    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace
+
+    p_dictdelim = r'\<\<|\>\>'
+    p_name = r'/[^%s%s]*' % (delimiters, whitespace)
+
+    p_catchall = '[^%s]' % whitespace
+
+    # The order of the alternatives matters: the first one which
+    # matches at a position wins.  Group 1 is the token itself; the
+    # whole match also takes in any whitespace following the token.
+    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
+                        p_literal_string, p_comment, p_catchall])
+    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
+                         re.DOTALL).finditer
+    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
+                                          whitespace), re.DOTALL).finditer
+
+    def _gettoks(self, startloc, intern=intern,
+                 delimiters=delimiters, findtok=findtok,
+                 findparen=findparen, PdfString=PdfString,
+                 PdfObject=PdfObject, BasePdfName=BasePdfName):
+        ''' Given a source data string and a location inside it,
+            gettoks generates tokens.  Each token is yielded as an
+            object built from the token text; the span of the most
+            recent token is kept in self.current[0] as
+             (<starting file loc>, <ending file loc>)
+            The ending file loc is past any trailing whitespace.
+
+            The main complication here is the literal strings, which
+            can contain nested parentheses.  In order to cope with these
+            we can discard the current iterator and loop back to the
+            top to get a fresh one.
+
+            We could use re.search instead of re.finditer, but that's slower.
+
+            The keyword parameters only exist to turn global and
+            class attribute lookups into local variable lookups.
+        '''
+        fdata = self.fdata
+        # NOTE(review): a generator body does not start running until
+        # the first token is requested, and this line then replaces
+        # self.current -- so a position assigned through floc before
+        # the first token is retrieved is discarded.
+        current = self.current = [(startloc, startloc)]
+        # Equal token texts share one token object
+        cache = {}
+        get_cache = cache.get
+        while 1:
+            for match in findtok(fdata, current[0][1]):
+                current[0] = tokspan = match.span()
+                token = match.group(1)
+                firstch = token[0]
+                # Unless decided otherwise below (delimiters such
+                # as [ ] << >>), the token is an interned string.
+                toktype = intern
+                if firstch not in delimiters:
+                    toktype = PdfObject
+                elif firstch in '/<(%':
+                    if firstch == '/':
+                        # PDF Name
+                        toktype = BasePdfName
+                    elif firstch == '<':
+                        # << dict delim, or < hex string >
+                        if token[1:2] != '<':
+                            toktype = PdfString
+                    elif firstch == '(':
+                        # Literal string
+                        # It's probably simple, but maybe not
+                        # Nested parentheses are a bear, and if
+                        # they are present, we exit the for loop
+                        # and get back in with a new starting location.
+                        ends = None  # For broken strings
+                        if fdata[match.end(1) - 1] != ')':
+                            # The first piece stopped at a nested
+                            # (, so two ) are now outstanding.
+                            nest = 2
+                            m_start, loc = tokspan
+                            for match in findparen(fdata, loc):
+                                loc = match.end(1)
+                                ending = fdata[loc - 1] == ')'
+                                # +1 for a piece ending in (,
+                                # -1 for a piece ending in )
+                                nest += 1 - ending * 2
+                                if not nest:
+                                    break
+                                # Remember where the first unescaped )
+                                # was, in case the string never balances
+                                if ending and ends is None:
+                                    ends = loc, match.end(), nest
+                            token = fdata[m_start:loc]
+                            current[0] = m_start, match.end()
+                            if nest:
+                                # There is one possible recoverable error
+                                # seen in the wild -- some stupid generators
+                                # don't escape (.  If this happens, just
+                                # terminate on first unescaped ). The string
+                                # won't be quite right, but that's a science
+                                # fair project for another time.
+                                #
+                                # Logs an error if a ) was seen; raises
+                                # (via self.exception) if none was.
+                                (self.error, self.exception)[not ends](
+                                    'Unterminated literal string')
+                                loc, ends, nest = ends
+                                token = fdata[m_start:loc] + ')' * nest
+                                current[0] = m_start, ends
+                        toktype = PdfString
+                    elif firstch == '%':
+                        # Comment
+                        if self.strip_comments:
+                            continue
+                    else:
+                        self.exception(('Tokenizer logic incorrect -- '
+                                        'should never get here'))
+
+                newtok = get_cache(token)
+                if newtok is None:
+                    newtok = cache[token] = toktype(token)
+                yield newtok
+                # If the position was changed (by the literal string
+                # code above, or by the consumer assigning to floc
+                # while we were suspended), restart the search there.
+                if current[0] is not tokspan:
+                    break
+            else:
+                # Ran out of data, so the generator is finished.
+                # This must be a plain return: since PEP 479 (the
+                # default from Python 3.7), a StopIteration raised
+                # inside a generator turns into a RuntimeError.
+                return
+
+    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
+        ''' Set up a tokenizer over fdata, starting at index startloc.
+            See the class documentation for the parameters.
+        '''
+        self.fdata = fdata
+        self.strip_comments = strip_comments
+        self.iterator = iterator = self._gettoks(startloc)
+        # None means every message is reported; otherwise this is
+        # the set of messages which have been reported already.
+        self.msgs_dumped = None if verbose else set()
+        # Bound method which retrieves the next token
+        self.next = getattr(iterator, nextattr)
+        # One-element list holding the (start, end) span of the
+        # most recently retrieved token.
+        self.current = [(startloc, startloc)]
+
+    def setstart(self, startloc):
+        ''' Change the starting location.
+
+            Nothing is changed if the next token would be sought
+            at startloc anyway.
+        '''
+        current = self.current
+        if startloc != current[0][1]:
+            current[0] = startloc, startloc
+
+    def floc(self):
+        ''' Return the current file position
+            (where the next token will be retrieved)
+        '''
+        return self.current[0][1]
+    floc = property(floc, setstart)
+
+    def tokstart(self):
+        ''' Return the file position of the most
+            recently retrieved token.
+        '''
+        return self.current[0][0]
+    tokstart = property(tokstart, setstart)
+
+    def __iter__(self):
+        ''' Return the underlying token generator.
+        '''
+        return self.iterator
+
+    def multiple(self, count, islice=itertools.islice, list=list):
+        ''' Retrieve multiple tokens
+
+            Returns a list of the next count tokens; the list is
+            shorter if the data runs out first.
+        '''
+        return list(islice(self, count))
+
+    def next_default(self, default='nope'):
+        ''' Return the next token, or default if there are
+            no more tokens.
+        '''
+        for result in self:
+            return result
+        return default
+
+    def msg(self, msg, *arg):
+        ''' Return msg (%-formatted with arg, if any is given)
+            followed by the position of the current token.
+
+            Returns None instead if messages are being de-duplicated
+            (verbose was false) and this msg was already reported.
+        '''
+        dumped = self.msgs_dumped
+        if dumped is not None:
+            # De-duplicate on the unformatted message
+            if msg in dumped:
+                return
+            dumped.add(msg)
+        if arg:
+            msg %= arg
+        fdata = self.fdata
+        begin, end = self.current[0]
+        if begin >= len(fdata):
+            return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
+        line, col = linepos(fdata, begin)
+        if end > begin:
+            tok = fdata[begin:end].rstrip()
+            # Keep long tokens from swamping the message
+            if len(tok) > 30:
+                tok = tok[:26] + ' ...'
+            return ('%s (line=%d, col=%d, token=%s)' %
+                    (msg, line, col, repr(tok)))
+        return '%s (line=%d, col=%d)' % (msg, line, col)
+
+    def warning(self, *arg):
+        ''' Log a warning built by msg(), unless it is suppressed.
+        '''
+        s = self.msg(*arg)
+        if s:
+            log.warning(s)
+
+    def error(self, *arg):
+        ''' Log an error built by msg(), unless it is suppressed.
+        '''
+        s = self.msg(*arg)
+        if s:
+            log.error(s)
+
+    def exception(self, *arg):
+        ''' Raise PdfParseError with the message built by msg().
+        '''
+        raise PdfParseError(self.msg(*arg))
--- a/pdfrw/toreportlab.py
+++ b/pdfrw/toreportlab.py
@ -0,0 +1,146 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+Converts pdfrw objects into reportlab objects.
+
+Designed for and tested with rl 2.3.
+
+Knows too much about reportlab internals.
+What can you do?
+
+The interface to this function is through the makerl() function.
+
+Parameters:
+        canv       - a reportlab "canvas" (also accepts a "document")
+        pdfobj      - a pdfrw PDF object
+
+Returns:
+        A corresponding reportlab object, or if the
+        object is a PDF Form XObject, the name to
+        use with reportlab for the object.
+
+        Will recursively convert all necessary objects.
+        Be careful when converting a page -- if /Parent is set,
+        will recursively convert all pages!
+
+Notes:
+    1) Original objects are annotated with a
+        derived_rl_obj attribute which points to the
+        reportlab object.  This keeps multiple reportlab
+        objects from being generated for the same pdfobj
+        via repeated calls to makerl.  This is great for
+        not putting too many objects into the
+        new PDF, but not so good if you are modifying
+        objects for different pages.  Then you
+        need to do your own deep copying (of circular
+        structures).  You're on your own.
+
+    2) ReportLab seems weird about FormXObjects.
+       They pass around a partial name instead of the
+       object or a reference to it.  So we have to
+       reach into reportlab and get a number for
+       a unique name.  I guess this is to make it
+       where you can combine page streams with
+       impunity, but that's just a guess.
+
+    3) Updated 1/23/2010 to handle multipass documents
+       (e.g. with a table of contents).  These have
+       a different doc object on every pass.
+
+'''
+
+from reportlab.pdfbase import pdfdoc as rldocmodule
+from .objects import PdfDict, PdfArray, PdfName
+from .py23_diffs import convert_store
+
+RLStream = rldocmodule.PDFStream
+RLDict = rldocmodule.PDFDictionary
+RLArray = rldocmodule.PDFArray
+
+
+def _makedict(rldoc, pdfobj):
+    ''' Convert a pdfrw dictionary into a reportlab dictionary.
+
+        Returns the reportlab dictionary itself, or a reportlab
+        reference to it if pdfobj is an indirect object.
+    '''
+    rldict = RLDict()
+    if pdfobj.indirect:
+        rldict.__RefOnly__ = 1
+        result = rldoc.Reference(rldict)
+    else:
+        result = rldict
+    # Record the result before converting the contents, so that
+    # makerl_recurse finds it if this object is reached again.
+    pdfobj.derived_rl_obj[rldoc] = result, None
+
+    # The first character of each key is dropped
+    # (presumably the leading / of the PDF name).
+    for name, item in pdfobj.iteritems():
+        rldict[name[1:]] = makerl_recurse(rldoc, item)
+
+    return result
+
+
+def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
+    ''' Convert a pdfrw stream into a reportlab stream.
+
+        Returns a reportlab reference to the new stream.  If the
+        Type of pdfobj equals xobjtype, a name for it is also
+        obtained from reportlab and recorded with the result.
+    '''
+    rldict = RLDict()
+    rlstream = RLStream(rldict, convert_store(pdfobj.stream))
+
+    shortname = fullname = None
+    if pdfobj.Type == xobjtype:
+        shortname = 'pdfrw_%s' % (rldoc.objectcounter + 1)
+        fullname = rldoc.getXObjectName(shortname)
+    result = rldoc.Reference(rlstream, fullname)
+    # Record the result before converting the contents, so that
+    # makerl_recurse finds it if this object is reached again.
+    pdfobj.derived_rl_obj[rldoc] = result, shortname
+
+    # The first character of each key is dropped
+    # (presumably the leading / of the PDF name).
+    for name, item in pdfobj.iteritems():
+        rldict[name[1:]] = makerl_recurse(rldoc, item)
+
+    return result
+
+
+def _makearray(rldoc, pdfobj):
+    ''' Convert a pdfrw array into a reportlab array.
+
+        Returns the reportlab array itself, or a reportlab
+        reference to it if pdfobj is an indirect object.
+    '''
+    rlarray = RLArray([])
+    if pdfobj.indirect:
+        rlarray.__RefOnly__ = 1
+        result = rldoc.Reference(rlarray)
+    else:
+        result = rlarray
+    # Record the result before converting the contents, so that
+    # makerl_recurse finds it if this object is reached again.
+    pdfobj.derived_rl_obj[rldoc] = result, None
+
+    append = rlarray.sequence.append
+    for item in pdfobj:
+        append(makerl_recurse(rldoc, item))
+
+    return result
+
+
+def _makestr(rldoc, pdfobj):
+    ''' Convert a non-container object into the string
+        which is handed to reportlab.
+
+        Parameters:
+            rldoc  - unused; present so that all of the
+                     conversion functions are called the same way
+            pdfobj - a float, int or str (or subclass)
+
+        Returns the encoded attribute of pdfobj if it has a
+        non-empty one, otherwise str(pdfobj).  Floats which str()
+        would render in exponent notation are written out in
+        fixed-point notation instead.
+    '''
+    assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
+    result = str(getattr(pdfobj, 'encoded', None) or pdfobj)
+    if isinstance(pdfobj, float) and 'e' in result.lower():
+        # PDFs don't handle exponent notation.  Only these values
+        # are reformatted, so all other floats keep the exact
+        # text which str() gives them.
+        result = ('%.9f' % pdfobj).rstrip('0').rstrip('.')
+    return result
+
+
+def makerl_recurse(rldoc, pdfobj):
+    ''' Return the reportlab object corresponding to pdfobj
+        within the reportlab document rldoc.
+
+        Conversions are remembered in the derived_rl_obj attribute
+        of dictionaries and arrays, keyed by document, so a given
+        object is only converted once per document.
+    '''
+    docdict = getattr(pdfobj, 'derived_rl_obj', None)
+    if docdict is not None:
+        cached = docdict.get(rldoc)
+        if cached is not None:
+            return cached[0]
+
+    if isinstance(pdfobj, PdfDict):
+        # A dictionary with stream data is converted as a stream
+        convert = _makedict if pdfobj.stream is None else _makestream
+        if docdict is None:
+            pdfobj.private.derived_rl_obj = {}
+    elif isinstance(pdfobj, PdfArray):
+        convert = _makearray
+        if docdict is None:
+            pdfobj.derived_rl_obj = {}
+    else:
+        convert = _makestr
+    return convert(rldoc, pdfobj)
+
+
+def makerl(canv, pdfobj):
+    ''' Convert pdfobj for use with reportlab.
+
+        canv is either an object with a _doc attribute (in which
+        case that attribute is the document used), or the
+        document itself.
+
+        Returns the name recorded for pdfobj during conversion
+        if there is one, otherwise the converted object.
+    '''
+    rldoc = getattr(canv, '_doc', canv)
+    rlobj = makerl_recurse(rldoc, pdfobj)
+    try:
+        name = pdfobj.derived_rl_obj[rldoc][1]
+    except AttributeError:
+        # Nothing was recorded on this kind of object
+        return rlobj
+    return name or rlobj
--- a/pdfrw/uncompress.py
+++ b/pdfrw/uncompress.py
@ -0,0 +1,117 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# Copyright (C) 2012-2015 Nerijus Mika
+# MIT license -- See LICENSE.txt for details
+# Copyright (c) 2006, Mathieu Fenniak
+# BSD license -- see LICENSE.txt for details
+'''
+A small subset of decompression filters.  Should add more later.
+
+I believe, after looking at the code, that portions of the flate
+PNG predictor were originally transcribed from PyPDF2, which is
+probably an excellent source of additional filters.
+'''
+import array
+from .objects import PdfDict, PdfName, PdfArray
+from .errors import log
+from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
+
+
+def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
+    ''' Generate those members of mylist which are
+        PdfDict instances with stream data.
+    '''
+    for candidate in mylist:
+        if not isinstance(candidate, PdfDict):
+            continue
+        if candidate.stream is None:
+            continue
+        yield candidate
+
+# Hack so we can import if zlib not available:
+# decompressobj is then None rather than a callable.
+decompressobj = None if zlib is None else zlib.decompressobj
+
+
+# Decompress, in place, every FlateDecode stream object found in mylist.
+#
+# Parameters:
+#       mylist    - iterable of objects; anything which is not a
+#                   PdfDict with stream data is ignored
+#       leave_raw - if true, the decompressed data is stored exactly
+#                   as the decompressor returned it; otherwise it is
+#                   passed through convert_load first
+#       warnings  - messages already logged about unusable filters.
+#                   The default set is created once and shared by
+#                   all calls, so each distinct message is logged
+#                   only once unless the caller passes its own set.
+#       (the remaining parameters only bind globals as locals)
+#
+# Returns:
+#       True if every stream which had a filter was decompressed,
+#       False if any of them had to be left as it was.
+def uncompress(mylist, leave_raw=False, warnings=set(),
+               flate=PdfName.FlateDecode, decompress=decompressobj,
+               isinstance=isinstance, list=list, len=len):
+    ok = True
+    for obj in streamobjects(mylist):
+        ftype = obj.Filter
+        if ftype is None:
+            # Not compressed; nothing to do
+            continue
+        if isinstance(ftype, list) and len(ftype) == 1:
+            # todo: multiple filters
+            ftype = ftype[0]
+        parms = obj.DecodeParms or obj.DP
+        if ftype != flate:
+            # Any other filter (including a list of several
+            # filters) is reported, and the stream left alone.
+            msg = ('Not decompressing: cannot use filter %s'
+                   ' with parameters %s') % (repr(ftype), repr(parms))
+            if msg not in warnings:
+                warnings.add(msg)
+                log.warning(msg)
+            ok = False
+        else:
+            # NOTE(review): decompress is None when zlib could not
+            # be imported, in which case this call fails.
+            dco = decompress()
+            try:
+                data = dco.decompress(convert_store(obj.stream))
+            except Exception as s:
+                error = str(s)
+            else:
+                error = None
+                if isinstance(parms, PdfArray):
+                    # Merge an array of parameter dictionaries
+                    # into a single dictionary
+                    oldparms = parms
+                    parms = PdfDict()
+                    for x in oldparms:
+                        parms.update(x)
+                if parms:
+                    predictor = int(parms.Predictor or 1)
+                    columns = int(parms.Columns or 1)
+                    colors = int(parms.Colors or 1)
+                    bpc = int(parms.BitsPerComponent or 8)
+                    # Predictors 10 through 15 are handed to flate_png;
+                    # predictor 1 leaves the data as it is.
+                    if 10 <= predictor <= 15:
+                        data, error = flate_png(data, predictor, columns, colors, bpc)
+                    elif predictor != 1:
+                        error = ('Unsupported flatedecode predictor %s' %
+                                 repr(predictor))
+            if error is None:
+                assert not dco.unconsumed_tail
+                # Anything except whitespace after the end of
+                # the compressed data is treated as an error
+                if dco.unused_data.strip():
+                    error = ('Unconsumed compression data: %s' %
+                             repr(dco.unused_data[:20]))
+            if error is None:
+                # Success: the stream is no longer filtered
+                obj.Filter = None
+                obj.stream = data if leave_raw else convert_load(data)
+            else:
+                # Failure: log it, and leave the object untouched
+                log.error('%s %s' % (error, repr(obj.indirect)))
+                ok = False
+    return ok
+
+
+def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
+    ''' PNG prediction is used to make certain kinds of data
+        more compressible.  Before the compression, each data
+        byte is either left the same, or is set to be a delta
+        from the previous byte, or is set to be a delta from
+        the previous row.  This selection is done on a per-row
+        basis, and is indicated by a compression type byte
+        prepended to each row of data.
+
+        Within more recent PDF files, it is normal to use
+        this technique for Xref stream objects, which are
+        quite regular.
+
+        Parameters:
+            data      - the decompressed, but still predicted, data
+            predictor - the Predictor value; for 15 the data is
+                        zero-padded up to a whole number of rows
+            columns   - samples per row
+            colors    - color components per sample
+            bpc       - bits per color component
+
+        Returns:
+            (data, None) on success, where data is the
+            reconstructed data with the type bytes removed, or
+            (None, message) on failure.
+
+        Only PNG filter types 0 (None), 1 (Sub) and 2 (Up)
+        are supported.
+    '''
+    # Bytes per complete pixel (at least one): how far back the
+    # Sub filter reaches for the corresponding byte.
+    pixelbytes = max(1, ((colors * bpc) + 7) // 8)
+    columnbytes = ((columns * colors * bpc) + 7) // 8
+    data = array.array('B', data)
+    # Every row is preceded by its filter type byte
+    rowlen = columnbytes + 1
+    if predictor == 15:
+        padding = (rowlen - len(data)) % rowlen
+        data.extend([0] * padding)
+    if len(data) % rowlen:
+        # Report malformed data through the error result, the same
+        # way as an unsupported filter, rather than asserting.
+        return None, ('PNG predictor data length %d is not a multiple '
+                      'of the row length %d' % (len(data), rowlen))
+    for row_index in xrange(0, len(data), rowlen):
+        filter_type = data[row_index]
+        row_start = row_index + 1
+        row_end = row_index + rowlen
+        if filter_type == 0:
+            # None: the row is stored as it is
+            continue
+        if filter_type == 1:
+            # Sub: each byte is a delta from the corresponding byte
+            # of the pixel to its left.  The first pixel of a row has
+            # nothing to its left, so those bytes are already final --
+            # in particular the filter type byte, which comes just
+            # before them, must not be added in.
+            for index in xrange(row_start + pixelbytes, row_end):
+                data[index] = (data[index] + data[index - pixelbytes]) % 256
+        elif filter_type == 2:
+            # Up: each byte is a delta from the byte above it, which
+            # has been reconstructed already.  There is nothing above
+            # the first row, so it is already final.
+            if row_index:
+                for index in xrange(row_start, row_end):
+                    data[index] = (data[index] + data[index - rowlen]) % 256
+        else:
+            return None, 'Unsupported PNG filter %d' % filter_type
+    # Drop the filter type byte of every row in a single pass
+    del data[::rowlen]
+    return from_array(data), None
--- a/releasing.txt
+++ b/releasing.txt
@ -0,0 +1,10 @@
+Notes on releasing, which is not yet fully automated:
+
+1) Update version number in pdfrw/__init__.py
+
+2) Use pyroma
+
+3) https://packaging.python.org/en/latest/distributing.html
+
+a) python setup.py sdist bdist_wheel
+b) twine upload dist/*
--- a/setup.cfg
+++ b/setup.cfg
@ -0,0 +1,5 @@
+[bdist_wheel]
+# This flag says that the code is written to work on both Python 2 and Python
+# 3. If at all possible, it is good practice to do this. If you cannot, you
+# will need to generate wheels for each Python version that you support.
+universal=1
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+from setuptools import setup
+from pdfrw import __version__ as version
+from pdfrw.py23_diffs import convert_load
+
+# Read the long description up front, so that the file is closed
+# as soon as it has been read rather than whenever the file
+# object happens to be garbage collected.
+with open("README.rst", 'rb') as readme:
+    long_description = convert_load(readme.read())
+
+setup(
+    name='pdfrw',
+    version=version,
+    description='PDF file reader/writer library',
+    long_description=long_description,
+    author='Patrick Maupin',
+    author_email='pmaupin@gmail.com',
+    platforms='Independent',
+    url='https://github.com/pmaupin/pdfrw',
+    packages=['pdfrw', 'pdfrw.objects'],
+    license='MIT',
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
+        'Topic :: Software Development :: Libraries',
+        'Topic :: Text Processing',
+        'Topic :: Printing',
+        'Topic :: Utilities',
+    ],
+    keywords='pdf vector graphics PDF nup watermark split join merge',
+    zip_safe=True,
+)
--- a/tests/init.py
+++ b/tests/init.py
@ -0,0 +1 @@
+# This file intentionally left blank.
--- a/tests/checkdiffs.py
+++ b/tests/checkdiffs.py
@ -0,0 +1,81 @@
+#! /usr/bin/env python2
+
+import sys
+import os
+import subprocess
+import hashlib
+
+import expected
+import static_pdfs
+
+# Interactively review result PDFs which do not have a verdict in
+# the expected-results file yet, appending each verdict to that file.
+
+# Map the base name of every source PDF to its full path
+source_pdfs = static_pdfs.pdffiles[0]
+source_pdfs = dict((os.path.basename(x), x) for x in source_pdfs)
+
+result_dir = expected.result_dir
+
+for subdir in sorted(os.listdir(result_dir)):
+    dstd = os.path.join(result_dir, subdir)
+    if not os.path.isdir(dstd):
+        continue
+    for pdffile in sorted(os.listdir(dstd)):
+        testname = '%s/%s' % (subdir, pdffile)
+        srcf = source_pdfs.get(pdffile)
+        dstf = os.path.join(dstd, pdffile)
+        if pdffile not in source_pdfs:
+            print('\n Skipping %s -- source not found' % testname)
+            continue
+
+        with open(dstf, 'rb') as f:
+            data = f.read()
+        md5hash = hashlib.md5(data).hexdigest()
+        # A result is done if it is marked to be skipped or to fail,
+        # or if this exact output was already marked good or bad (!).
+        skipset = set((md5hash, 'skip', 'xfail', 'fail', '!' + md5hash))
+        if expected.results[testname] & skipset:
+            print('\n Skipping %s -- marked done' % testname)
+            continue
+        if os.path.exists('foobar.pdf'):
+            os.remove('foobar.pdf')
+        # Set once foobar.pdf has been built for this test case
+        builtdiff = False
+        while 1:
+            sys.stdout.write('''
+                Test case %s
+
+                c = compare using imagemagick and okular
+                f = display foobar.pdf (result from comparison)
+                o = display results with okular
+                a = display results with acrobat
+
+                s = mark 'skip' and go to next PDF
+                g = mark as good and go to next PDF
+                b = mark as bad and go to next PDF
+                n = next pdf without marking
+                q = quit
+-->  ''' % testname)
+            sel = raw_input()
+            if sel == 'q':
+                raise SystemExit(0)
+            if sel == 'n':
+                break
+            if sel == 'c':
+                subprocess.call(('compare', '-verbose', srcf, dstf,
+                                 'foobar.pdf'))
+                builtdiff = True
+                continue
+            if sel == 'f':
+                subprocess.call(('okular', 'foobar.pdf'))
+                continue
+            if sel == 'o':
+                subprocess.call(('okular', srcf, dstf))
+                continue
+            if sel == 'a':
+                if builtdiff:
+                    subprocess.call(('acroread', srcf, dstf, 'foobar.pdf'))
+                else:
+                    subprocess.call(('acroread', srcf, dstf))
+                continue
+
+            # Compare against the individual choices.  A substring
+            # test against 'sgb' would also accept an empty line
+            # (or 'sg', 'gb'), and record the PDF as bad.
+            if sel in ('s', 'g', 'b'):
+                results = (md5hash if sel == 'g' else
+                           '    skip' if sel == 's' else '!'+md5hash)
+                with open(expected.expectedf, 'a') as f:
+                    f.write('%s %s\n' % (testname, results))
+                break
+            # Anything else just shows the menu again
--- a/tests/expected.py
+++ b/tests/expected.py
@ -0,0 +1,41 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+    Read expected.txt, which should be in the format:
+
+       testname/srcname.pdf validhash
+
+    More than one validhash is allowed (on separate lines),
+    and hash-delimited comments are allowed.
+'''
+
+import os
+import collections
+from pdfrw.py23_diffs import convert_load
+
+# Results are written below the directory holding this file, inside
+# its ramdisk subdirectory if the current directory has a ramdisk.
+root_dir = os.path.dirname(__file__)
+if os.path.exists('ramdisk'):
+    result_dir = os.path.join(root_dir, 'ramdisk', 'tmp_results')
+else:
+    result_dir = os.path.join(root_dir, 'tmp_results')
+
+# Use the first of these files which exists.  If neither
+# exists, the names are left referring to the last one.
+for sourcef in ('mytests.txt', 'expected.txt'):
+    expectedf = os.path.join(root_dir, sourcef)
+    if not os.path.exists(expectedf):
+        continue
+    break
+
+
+def results():
+    ''' Read the expected results file.
+
+        Returns a defaultdict which maps each test name
+        to the set of values listed for it in the file.
+    '''
+    table = collections.defaultdict(set)
+    with open(expectedf, 'rb') as f:
+        for raw in f:
+            # Drop any comment, then split on whitespace
+            fields = convert_load(raw).split('#', 1)[0].split()
+            if fields:
+                # Exactly two fields are expected per line
+                key, value = fields
+                table[key].add(value)
+    return table
+# The function is only needed once: its name is rebound to its result
+results = results()
--- a/tests/expected.txt
+++ b/tests/expected.txt
@ -0,0 +1,225 @@
+# Example programs
+
+examples/4up_b1c400de699af29ea3f1983bb26870ab               1b73c612c40b5082d955ed72f63644bd
+examples/alter_b1c400de699af29ea3f1983bb26870ab             3c3ee465f45a685ba7098691be05a5ab
+examples/booklet_b1c400de699af29ea3f1983bb26870ab           d711b74110eefb4e9e6bf1a5bea16bfe
+examples/extract_1975ef8db7355b1d691bc79d0749574b           b4f5ee36a288da970ed040a9a733c8b0
+examples/extract_c5c895deecf7a7565393587e0d61be2b           539aad09ef80907bb396c3260eb87d7b
+examples/extract_d711b74110eefb4e9e6bf1a5bea16bfe           26ddfd09c6e6002228f06782c8544ac4
+examples/print_two_b1c400de699af29ea3f1983bb26870ab         73c8a16aba44548c2c06dae6e2551961
+examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5      880a9578197130273ccb51265af08029
+examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe          780a9abe26a9de0b5b95ee22c4835e4b
+
+examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c          62bb9b746ff5932d3f1b88942d36a81d
+examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56                7633ba56641115050ba098ecbef8d331
+examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c    fe2330d42b3bfc06212415f295752f0e
+examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612
+
+# All these are in the poster test
+examples/subset_1975ef8db7355b1d691bc79d0749574b_21     5057f345f1a1109a0e54276a68e8f8df
+examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1   881f4dc8dcf069e707bf61af95492d86
+examples/poster_881f4dc8dcf069e707bf61af95492d86        a34be06d22105b6c02394a9f278fec0d
+
+examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab                   e21dfdd9ae56ddb261dc3d02bf6da198
+examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab               410063b7fbae1c6d5af33758e2b43450
+examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5            745f1ac31a18d86afb294a449b72cb98
+examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5
+
+# List things that need work here (typically cause exceptions)
+
+# Bad info dict -- works otherwise
+
+simple/b1c400de699af29ea3f1983bb26870ab.pdf         ecf2e28de18a724b53670c0d5637ec28
+repaginate/b1c400de699af29ea3f1983bb26870ab.pdf     4d7d6c5f6e14c6eac1dfc055cebfa499
+
+# 07b0ba4 is missing an object.  Best we can do is report it
+# (and we do)
+
+repaginate/07b0ba4cff1c6ff73fd468b04b013457.pdf     993c763e085bce7ecc941ba104f4c892
+simple/07b0ba4cff1c6ff73fd468b04b013457.pdf         499b9c1b1e1c76b7c5c0d5e3b62889e3
+
+#b107 has a single page, but with an empty contents dict.
+
+repaginate/b107669d1dd69eabb89765fabb2cb321.pdf     0652d2da25b50cad75863d0e2bbaa878
+simple/b107669d1dd69eabb89765fabb2cb321.pdf         56025c06ab8633575ddc6c6990d2fbf1
+
+# Encrypted files
+
+repaginate/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf     skip
+repaginate/6e122f618c27f3aa9a689423e3be6b8d.pdf     skip
+repaginate/7dc787639aa6765214e9ff5494d231ed.pdf     skip
+repaginate/b4b27aaa1f9c7c524298e98be279bebb.pdf     skip
+repaginate/b5b6c6405d7b48418bccf97277957664.pdf     skip
+repaginate/bd0ef57aec16ded45bd89d61b54af0be.pdf     skip
+repaginate/dbb807a878ac1da6b91ac15c9de4e209.pdf     skip
+simple/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf         skip
+simple/6e122f618c27f3aa9a689423e3be6b8d.pdf         skip
+simple/7dc787639aa6765214e9ff5494d231ed.pdf         skip
+simple/b4b27aaa1f9c7c524298e98be279bebb.pdf         skip
+simple/b5b6c6405d7b48418bccf97277957664.pdf         skip
+simple/bd0ef57aec16ded45bd89d61b54af0be.pdf         skip
+simple/dbb807a878ac1da6b91ac15c9de4e209.pdf         skip
+
+
+
+# List good hashes for round-trips here.
+
+repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b
+repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1
+repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8
+repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5
+repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c
+repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598
+repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4
+repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b
+repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61
+repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d
+repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795
+repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22
+repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0
+repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3
+repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0
+repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
+repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2
+repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45
+repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315
+repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69
+repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c
+repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f
+repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e
+repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261
+repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c
+repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
+repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233
+repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea
+repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273
+repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae
+repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83
+repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd
+repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840
+repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68
+repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa
+repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273
+repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d
+
+
+simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
+simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef
+simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
+simple/09715ec1a7b0f3a7ae02b3046f627b9f.pdf c4e4b3b725bd5fc3b008f1ac6251ad1c
+simple/1975ef8db7355b1d691bc79d0749574b.pdf 475c28c9588f3a7f6110d30f391758c4
+simple/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 3f17f19fd92adf01998bb13a0ee52b92
+simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
+simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d
+simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097
+simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa
+simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2
+simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf
+simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4
+simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef
+simple/365b9c95574ee8944370fe286905d0e8.pdf cf3bfac41f410bf5bd657e3f906dfbc6
+simple/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
+simple/49e31fd074eca6af981d78d42d0078ec.pdf 2c316537a5b0917634cbbdc5b91511df
+simple/536dfc6fbadd87c03eb59375d091eb53.pdf 319851765c70ba103c4191f7ec2148db
+simple/569f8094597bbe5b58efc3a7c6e14e87.pdf 025f1bf95cc537c36b8c3a044758b86c
+simple/5f0cff36d0ad74536a6513a98a755016.pdf 8476fd75e75394fcbbe02816d0640e7d
+simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9
+simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
+simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf
+simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05
+simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70
+simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
+simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e
+simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
+simple/9d8626d18b1d8807d271e6ffc409446a.pdf 2358d654bf20d2b9d179ab009a615c4e
+simple/9f98322c243fe67726d56ccfa8e0885b.pdf 9290b4c32f005e1e4c7f431955246c4c
+simple/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6b406128e0ed1ac23dc5a0ee34d1f717
+simple/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
+simple/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 2083f0e55cf06d88df02956a21bfef23
+simple/d6fd9567078b48c86710e9c49173781f.pdf 77464ec5cfdacb61a73b506bc4945631
+simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124
+simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
+simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27
+
+
+decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
+decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
+decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b
+decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36
+decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
+decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90
+decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637
+decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
+decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b
+decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad
+decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9
+decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3
+decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f
+decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e
+decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a
+decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6
+decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
+decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc
+decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6
+decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58
+decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56
+decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149
+decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
+decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0
+decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158
+decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24
+decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
+decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d
+decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
+decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6
+decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 4b53b63b0779b81d8f9569e66ca3d8ee
+decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
+decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227
+decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f
+decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
+decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a
+decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4
+decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380
+decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
+decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0
+
+compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa
+compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
+compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e
+compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3
+compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
+compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976
+compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c
+compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6
+compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4
+compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5
+compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24
+compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c
+compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30
+compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44
+compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403
+compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15
+compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e
+compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba
+compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9
+compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192
+compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20
+compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c
+compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
+compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7
+compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7
+compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e
+compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594
+compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1
+compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81
+compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd
+compress/9f98322c243fe67726d56ccfa8e0885b.pdf 3167fa11a3f1f4a06f90294b21e101b7
+compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
+compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f
+compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868
+compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8
+compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24
+compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229
+compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c
+compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81
+compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578
--- a/tests/myprofile.py
+++ b/tests/myprofile.py
@ -0,0 +1,5 @@
+import cProfile
+import unittest
+import test_roundtrip
+
+# Run the test_roundtrip module's unittest entry point under cProfile.
+cProfile.run('unittest.main(test_roundtrip)')
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@ -0,0 +1,195 @@
+#! /usr/bin/env python
+
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+Run from the directory above like so:
+
+   python -m tests.test_examples
+
+A PDF that has been determined to be good or bad
+should be added to expected.txt with either a good
+checksum, or just the word "skip" (or "xfail").
+
+These tests are incomplete, but they allow us to try
+out various PDFs.  There is a collection of difficult
+PDFs available on github.
+
+In order to use them:
+
+  1) Ensure that github.com/pmaupin/static_pdfs is on your path.
+
+  2) Use the imagemagick compare program to look at differences
+     between the static_pdfs/global directory and the tmp_results
+     directory after you run this.
+
+
+'''
+import sys
+import os
+import hashlib
+import subprocess
+import static_pdfs
+import expected
+
+from pdfrw.py23_diffs import convert_store
+from pdfrw import PdfReader, PdfWriter
+
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+
+
+# Absolute path template for an example program; the '%s' is filled in
+# with the program name (e.g. '4up' or 'rl1/4up') by TestOnePdf.do_test.
+prog_dir = os.path.join(expected.root_dir, '..', 'examples', '%s.py')
+prog_dir = os.path.abspath(prog_dir)
+# Output directory for the example runs, and the log of observed hashes.
+dstdir = os.path.join(expected.result_dir, 'examples')
+hashfile = os.path.join(expected.result_dir, 'hashes.txt')
+
+# Map each source file's basename minus its last four characters
+# (presumably the '.pdf' suffix) to its full path, so tests can refer to
+# input files by that short key.
+lookup = static_pdfs.pdffiles[0]
+lookup = dict((os.path.basename(x)[:-4], x) for x in lookup)
+
+
+class TestOnePdf(unittest.TestCase):
+    ''' Run the example programs and compare the MD5 of what they
+        write against the values recorded in the expected-results file.
+    '''
+
+    def do_test(self, params, prev_results=None, scrub=False):
+        ''' Run one example program and check the hash of its output.
+
+            params is a whitespace-separated string: the example
+            program name followed by its arguments.  Any word that is
+            a key in ``lookup`` is replaced by the path it maps to.
+
+            prev_results, when given, is a list; its first element is
+            set to the hash of the generated file (which is also
+            registered in ``lookup`` under that hash) so that a later
+            call can name the file as an input.
+
+            scrub, when true, reads the program's output back in with
+            PdfReader and rewrites its pages with PdfWriter to a file
+            named 'final.<output name>'; that file is the one hashed.
+
+            A line describing the outcome is always appended to
+            hashfile, whether the test passes, fails or is skipped.
+        '''
+        params = params.split()
+        hashkey = 'examples/%s' % '_'.join(params)
+        params = [lookup.get(x, x) for x in params]
+        progname = params[0]
+        params[0] = prog_dir % progname
+        srcf = params[1]
+        params.insert(0, sys.executable)
+        subdir, progname = os.path.split(progname)
+        subdir = os.path.join(dstdir, subdir)
+        if not os.path.exists(subdir):
+            os.makedirs(subdir)
+        # The file names below are relative, so the example is run from
+        # (and its output looked up in) the per-program output directory.
+        os.chdir(subdir)
+        dstf = '%s.%s' % (progname, os.path.basename(srcf))
+        # When scrubbing, 'scrub' holds the name of the program's raw
+        # output and dstf the name of the rewritten copy.
+        scrub = scrub and dstf
+        dstf = dstf if not scrub else 'final.%s' % dstf
+        digest = '------no-file-generated---------'
+        expects = expected.results[hashkey]
+
+        # If the test has been deliberately skipped,
+        # we are done.  Otherwise, execute it even
+        # if we don't know about it yet, so we have
+        # results to compare.
+
+        result = 'fail'
+        size = 0
+        try:
+            if 'skip' in expects:
+                result = 'skip requested'
+                return self.skipTest(result)
+            elif 'xfail' in expects:
+                result = 'xfail requested'
+                return self.fail(result)
+
+            exists = os.path.exists(dstf)
+            # Regenerate whenever there is something to compare against;
+            # an existing file is only reused when no hash is known yet.
+            if expects or not exists:
+                if exists:
+                    os.remove(dstf)
+                if scrub and os.path.exists(scrub):
+                    os.remove(scrub)
+                subprocess.call(params)
+                if scrub:
+                    PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
+            with open(dstf, 'rb') as f:
+                data = f.read()
+            size = len(data)
+            if data:
+                digest = hashlib.md5(data).hexdigest()
+                lookup[digest] = dstf
+                # Only report back when the caller supplied a list; a
+                # shared default list would leak state between calls.
+                if prev_results is not None:
+                    prev_results[0] = digest
+            else:
+                os.remove(dstf)
+            if expects:
+                if len(expects) == 1:
+                    expects, = expects
+                    self.assertEqual(digest, expects)
+                else:
+                    self.assertIn(digest, expects)
+                result = 'pass'
+            else:
+                result = 'skip'
+                self.skipTest('No hash available')
+        finally:
+            result = '%8d %-20s %s %s\n' % (size, result, hashkey, digest)
+            with open(hashfile, 'ab') as f:
+                f.write(convert_store(result))
+
+    def test_4up(self):
+        self.do_test('4up b1c400de699af29ea3f1983bb26870ab')
+
+    def test_booklet_unspread(self):
+        # The booklet output is fed to unspread and extract via its hash.
+        prev = [None]
+        self.do_test('booklet b1c400de699af29ea3f1983bb26870ab', prev)
+        if prev[0] is not None:
+            self.do_test('unspread ' + prev[0])
+            self.do_test('extract  ' + prev[0])
+
+    def test_print_two(self):
+        self.do_test('print_two b1c400de699af29ea3f1983bb26870ab')
+
+    def test_watermarks(self):
+        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
+                     '06c86654f9a77e82f9adaa0086fc391c')
+        self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
+                     '06c86654f9a77e82f9adaa0086fc391c -u')
+
+    def test_subset(self):
+        self.do_test('subset b1c400de699af29ea3f1983bb26870ab 1-3 5')
+
+    def test_alter(self):
+        self.do_test('alter b1c400de699af29ea3f1983bb26870ab')
+
+    def test_cat(self):
+        self.do_test('cat b1c400de699af29ea3f1983bb26870ab '
+                     '06c86654f9a77e82f9adaa0086fc391c')
+
+    def test_rotate(self):
+        self.do_test('rotate 707e3e2d17cbe9ec2273414b3b63f333 '
+                     '270 1-4 7-8 10-50 52-56')
+
+    def test_poster(self):
+        # Each step consumes the file produced by the previous one.
+        prev = [None]
+        self.do_test('subset 1975ef8db7355b1d691bc79d0749574b 21', prev)
+        self.do_test('rotate %s 90 1' % prev[0], prev)
+        self.do_test('poster %s' % prev[0], prev)
+
+    def test_extract(self):
+        self.do_test('extract 1975ef8db7355b1d691bc79d0749574b')
+        self.do_test('extract c5c895deecf7a7565393587e0d61be2b')
+
+    # The rl1 tests below are silently not run on Python older than 2.7.
+
+    def test_rl1_4up(self):
+        if sys.version_info < (2, 7):
+            return
+        self.do_test('rl1/4up     b1c400de699af29ea3f1983bb26870ab',
+                     scrub=True)
+
+    def test_rl1_booklet(self):
+        if sys.version_info < (2, 7):
+            return
+        self.do_test('rl1/booklet b1c400de699af29ea3f1983bb26870ab',
+                     scrub=True)
+
+    def test_rl1_subset(self):
+        if sys.version_info < (2, 7):
+            return
+        self.do_test('rl1/subset  b1c400de699af29ea3f1983bb26870ab 3 5',
+                     scrub=True)
+
+    def test_rl1_platypus(self):
+        if sys.version_info < (2, 7):
+            return
+        self.do_test('rl1/platypus_pdf_template b1c400de699af29ea3f1983bb26870ab',
+                     scrub=True)
+
+def main():
+    '''Run the tests in this module through unittest.main().'''
+    unittest.main()
+
+if __name__ == '__main__':
+    main()
--- a/tests/test_pdfdict.py
+++ b/tests/test_pdfdict.py
@ -0,0 +1,39 @@
+#! /usr/bin/env python
+# encoding: utf-8
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+#                    2016 James Laird-Wah, Sydney, Australia
+# MIT license -- See LICENSE.txt for details
+
+'''
+Run from the directory above like so:
+python -m tests.test_pdfdict
+'''
+
+
+from pdfrw import PdfDict, PdfName
+from pdfrw.objects import PdfIndirect
+
+import unittest
+
+
+class TestPdfDicts(unittest.TestCase):
+
+    def test_indirect_set_get(self):
+        ''' Store a PdfIndirect in a PdfDict and check what comes back:
+            the raw dict storage still holds the indirect object, item
+            lookup yields the indirect's value, and iterating the
+            PdfDict yields a key of the same type as PdfName.Name.
+        '''
+        indirect = PdfIndirect((1, 2, 3))
+        indirect.value = 42
+        pdf_dict = PdfDict()
+        pdf_dict.Name = indirect
+
+        # Look at the underlying dict directly, bypassing PdfDict.
+        (stored,) = dict.values(pdf_dict)
+        self.assertEqual(stored, indirect)
+
+        self.assertEqual(pdf_dict['/Name'], indirect.value)
+
+        (key,) = pdf_dict
+        self.assertEqual(type(key), type(PdfName.Name))
+
+def main():
+    '''Run the tests in this module through unittest.main().'''
+    unittest.main()
+
+
+if __name__ == '__main__':
+    main()
--- a/tests/test_pdfreader_init.py
+++ b/tests/test_pdfreader_init.py
@ -0,0 +1,28 @@
+#! /usr/bin/env python
+import static_pdfs
+
+from pdfrw import PdfReader
+
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+
+
+class TestPdfReaderInit(unittest.TestCase):
+    ''' Check that PdfReader can be constructed from an open binary
+        file object and from the raw bytes of a file.
+    '''
+
+    def test_fname_binary_filelike(self):
+        first_pdf = static_pdfs.pdffiles[0][0]
+        with open(first_pdf, 'rb') as handle:
+            PdfReader(handle)
+
+    def test_fdata_binary(self):
+        first_pdf = static_pdfs.pdffiles[0][0]
+        with open(first_pdf, 'rb') as handle:
+            PdfReader(fdata=handle.read())
+
+
+def main():
+    '''Run the tests in this module through unittest.main().'''
+    unittest.main()
+
+if __name__ == '__main__':
+    main()
--- a/tests/test_pdfstring.py
+++ b/tests/test_pdfstring.py
@ -0,0 +1,120 @@
+#! /usr/bin/env python
+# encoding: utf-8
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
+#                    2016 James Laird-Wah, Sydney, Australia
+# MIT license -- See LICENSE.txt for details
+
+'''
+Run from the directory above like so:
+python -m tests.test_pdfstring
+'''
+
+
+from pdfrw import PdfString
+from pdfrw.py23_diffs import convert_store
+
+import unittest
+
+
+class TestBaseEncoding(unittest.TestCase):
+    ''' Tests for PdfString encoding and decoding.
+
+        encode, decode, decode_bytes and roundtrip are helpers used by
+        the test_* methods; they are not tests themselves.
+    '''
+
+    def encode(self, value):
+        ''' Encode value with PdfString.encode, check that the result
+            equals PdfString.from_unicode(value) for unicode input or
+            PdfString.from_bytes(value) otherwise, and return it.
+        '''
+        x = PdfString.encode(value)
+        if isinstance(value, type(u'')):
+            y = PdfString.from_unicode(value)
+        else:
+            y = PdfString.from_bytes(value)
+        self.assertEqual(x, y)
+        return x
+
+    def decode(self, value):
+        ''' Wrap value in a PdfString, check that to_unicode() and
+            decode() agree, and return the decoded result.
+        '''
+        s = PdfString(value)
+        x = s.to_unicode()
+        y = s.decode()
+        self.assertEqual(x, y)
+        return x
+
+    def decode_bytes(self, decode_this, expected):
+        """ Decode to bytes"""
+        self.assertEqual(PdfString(decode_this).to_bytes(),
+                         convert_store(expected))
+
+    def roundtrip(self, value, expected=None):
+        ''' Encode value, check that decoding gives value back and,
+            when expected is given, that the encoded form equals it.
+            Returns the encoded form.
+        '''
+        result = self.encode(value)
+        self.assertEqual(value, self.decode(result))
+        if expected is not None:
+            self.assertEqual(result, expected)
+        return result
+
+    def test_doubleslash(self):
+        # One backslash, then two backslashes (raw string).
+        self.roundtrip('\\')
+        self.roundtrip(r'\\')
+
+    def test_unicode_encoding(self):
+        # The first character of the encoded form shows which syntax was
+        # chosen: '(' for a literal string, '<' for a hex string.
+        # These chars are in PdfDocEncoding
+        self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(')
+        # These chars are not in PdfDocEncoding
+        self.assertEqual(self.roundtrip(u'δΩσ')[0], '<')
+        # Check that we're doing a reasonable encoding
+        # Might want to change this later if we change the definition of reasonable
+        self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)')
+        self.roundtrip(u'(\n\u0101', '<FEFF0028000A0101>')
+
+
+    def test_constructor(self):
+        # Only checks that construction does not raise; obj is unused.
+        obj = PdfString('hello')
+
+    def test_continuation(self):
+        # A backslash followed by an end-of-line (\n, \r or \r\n) inside
+        # a literal string must decode as if it were not there.
+        # See PDF 1.7 ref section 3.2 page 55
+        s1 = PdfString('(These two strings are the same.)')
+        self.assertEqual(s1.decode(), s1[1:-1])
+        s2 = PdfString('(These \\\ntwo strings \\\nare the same.)')
+        self.assertEqual(s1.decode(), s2.decode())
+        s2 = PdfString(s2.replace('\n', '\r'))
+        self.assertEqual(s1.decode(), s2.decode())
+        s2 = PdfString(s2.replace('\r', '\r\n'))
+        self.assertEqual(s1.decode(), s2.decode())
+
+    def test_hex_whitespace(self):
+        # Whitespace between the digits of a hex string is ignored.
+        # See PDF 1.7 ref section 3.2 page 56
+        self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB')
+
+    def test_unicode_escaped_decode(self):
+        # Some PDF producers happily put unicode strings in PdfDocEncoding,
+        # because the Unicode BOM and \0 are valid code points
+        decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)')
+        self.assertEqual(decoded, "hello")
+
+
+    def test_unescaping(self):
+        # Backslash escapes in literal strings: named escapes, octal
+        # escapes of one to three digits, and a backslash before an
+        # ordinary character.
+        self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)',
+                           ' ( ) \\ \n \t \f \r \r\n \\n')
+
+        self.decode_bytes(r'(\b\010\10)', '\b\b\b')
+        self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)',
+                          '\n\n\r\r\t\t\b\b\f\f()\001\023\f3')
+        self.decode_bytes(r'(\\\nabc)', '\\\nabc')
+        self.decode_bytes(r'(\ )', ' ')
+
+    def test_BOM_variants(self):
+        # Strings that start with, or look like, a byte-order mark.
+        self.roundtrip(u'\ufeff', '<FEFFFEFF>')
+        self.roundtrip(u'\ufffe', '<FEFFFFFE>')
+        self.roundtrip(u'\xfe\xff', '<FEFF00FE00FF>')
+        self.roundtrip(u'\xff\xfe', '(\xff\xfe)')
+        self.assertRaises(UnicodeError, PdfString.from_unicode,
+                          u'þÿ blah', text_encoding='pdfdocencoding')
+
+    def test_byte_encode(self):
+        self.assertEqual(self.encode(b'ABC'), '(ABC)')
+
+    def test_nullstring(self):
+        # Empty hex and empty literal strings both decode to no bytes.
+        self.assertEqual(PdfString('<>').to_bytes(), b'')
+        self.assertEqual(PdfString('()').to_bytes(), b'')
+
+def main():
+    '''Run the tests in this module through unittest.main().'''
+    unittest.main()
+
+
+if __name__ == '__main__':
+    main()
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@ -0,0 +1,138 @@
+#! /usr/bin/env python
+
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+Run from the directory above like so:
+
+   python -m tests.test_roundtrip
+
+A PDF that has been determined to be good or bad
+should be added to expected.txt with either a good
+checksum, or just the word "skip" (or "xfail").
+
+These tests are incomplete, but they allow us to try
+out various PDFs.  There is a collection of difficult
+PDFs available on github.
+
+In order to use them:
+
+  1) Ensure that github.com/pmaupin/static_pdfs is on your path.
+
+  2) Use the imagemagick compare program to look at differences
+     between the static_pdfs/global directory and the tmp_results
+     directory after you run this.
+
+
+'''
+import os
+import hashlib
+import pdfrw
+import static_pdfs
+import expected
+
+from pdfrw.py23_diffs import convert_store
+
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+
+
+class TestOnePdf(unittest.TestCase):
+
+    def roundtrip(self, testname, basename, srcf, decompress=False,
+                  compress=False, repaginate=False):
+        ''' Read srcf with pdfrw, write it back out and check the MD5
+            of the written file against the expected values.
+
+            The output goes to <result_dir>/<testname>/<basename> and
+            the expected values are looked up under the key
+            '<testname>/<basename>'.  decompress is passed to
+            PdfReader and compress to PdfWriter; with repaginate the
+            pages are added to the writer one set at a time instead of
+            handing it the whole trailer.
+
+            A line describing the outcome is always appended to
+            hashes.txt in the result directory.
+        '''
+        out_dir = os.path.join(expected.result_dir, testname)
+        if not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        dstf = os.path.join(out_dir, basename)
+        hashfile = os.path.join(expected.result_dir, 'hashes.txt')
+        hashkey = '%s/%s' % (testname, basename)
+        digest = '------no-file-generated---------'
+        expects = expected.results[hashkey]
+
+        # A deliberately skipped test stops right away.  Anything
+        # else is executed even when no hash is known for it yet,
+        # so that there are results to compare.
+
+        outcome = 'fail'
+        size = 0
+        try:
+            if 'skip' in expects:
+                outcome = 'skip requested'
+                return self.skipTest(outcome)
+            if 'xfail' in expects:
+                outcome = 'xfail requested'
+                return self.fail(outcome)
+
+            already_there = os.path.exists(dstf)
+            if expects or not already_there:
+                if already_there:
+                    os.remove(dstf)
+                trailer = pdfrw.PdfReader(srcf, decompress=decompress,
+                                          verbose=False)
+                if trailer.Encrypt:
+                    outcome = 'skip -- encrypt'
+                    digest = '------skip-encrypt-no-file------'
+                    return self.skipTest('File encrypted')
+                writer = pdfrw.PdfWriter(dstf, compress=compress)
+                if repaginate:
+                    writer.addpages(trailer.pages)
+                else:
+                    writer.trailer = trailer
+                writer.write()
+
+            with open(dstf, 'rb') as f:
+                data = f.read()
+            size = len(data)
+            if data:
+                digest = hashlib.md5(data).hexdigest()
+            else:
+                os.remove(dstf)
+
+            if not expects:
+                outcome = 'skip'
+                self.skipTest('No hash available')
+            if len(expects) == 1:
+                only, = expects
+                self.assertEqual(digest, only)
+            else:
+                self.assertIn(digest, expects)
+            outcome = 'pass'
+        finally:
+            line = '%8d %-20s %s %s\n' % (size, outcome, hashkey, digest)
+            with open(hashfile, 'ab') as f:
+                f.write(convert_store(line))
+
+
+def build_tests():
+    ''' Attach one test method to TestOnePdf for every combination of
+        round-trip mode and file in static_pdfs.pdffiles[0].
+
+        Each method is named 'test_<mode>_<basename>' and calls
+        TestOnePdf.roundtrip with that mode's flags.
+    '''
+    def test_closure(*args, **kw):
+        def test(self):
+            self.roundtrip(*args, **kw)
+        return test
+
+    flag_names = ('repaginate', 'decompress', 'compress')
+    modes = (
+        ('simple', False, False, False),
+        ('repaginate', True, False, False),
+        ('decompress', False, True, False),
+        ('compress', False, True, True),
+    )
+    for mode in modes:
+        mytest = mode[0]
+        flags = dict(zip(flag_names, mode[1:]))
+        for srcf in static_pdfs.pdffiles[0]:
+            basename = os.path.basename(srcf)
+            setattr(TestOnePdf, 'test_%s_%s' % (mytest, basename),
+                    test_closure(mytest, basename, srcf, **flags))
+build_tests()
+
+
+def main():
+    '''Run the tests in this module through unittest.main().'''
+    unittest.main()
+
+if __name__ == '__main__':
+    main()
--- a/tests/update_expected.py
+++ b/tests/update_expected.py
@ -0,0 +1,84 @@
+#! /usr/bin/env python2
+"""
+Put old (good) results in ramdisk/reference,
+then generate new (unknown) test results in ramdisk/tmp_results,
+THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally:
+
+run this script to update any checksums in expected.txt where both versions
+parse to the same PDFs.
+"""
+
+import os
+import hashlib
+from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject
+
+
+def make_canonical(trailer):
+    ''' Canonicalizes a PDF in place and returns the same trailer.
+
+        Walks every object reachable from the trailer's values, visiting
+        each one once, and sets its "indirect" attribute to True.
+        Assumes everything is a Pdf object already.
+    '''
+    seen = set()
+    pending = list(trailer.values())
+    while pending:
+        item = pending.pop()
+        key = id(item)
+        if key not in seen:
+            # Track identity, not equality, so shared and cyclic
+            # references are processed exactly once.
+            seen.add(key)
+            item.indirect = True
+            if isinstance(item, PdfArray):
+                pending.extend(item)
+            elif isinstance(item, PdfDict):
+                pending.extend(item.values())
+    return trailer
+
+# Load the current contents of expected.txt as raw bytes.  Digests are
+# looked up in this blob, and the file is rewritten from it at the end
+# of the script.
+with open('expected.txt', 'rb') as f:
+    expected = f.read()
+
+def get_digest(fname):
+    """Return the hex MD5 digest of the file's bytes, or None if empty."""
+    with open(fname, 'rb') as stream:
+        contents = stream.read()
+    return hashlib.md5(contents).hexdigest() if contents else None
+
+# Scratch file that receives each canonicalized PDF; removed when done.
+tmp = '_temp.pdf'
+count = 0      # reference PDFs whose digest appears in expected.txt
+goodcount = 0  # of those, how many match the new result canonically
+
+
+def _canonical_digest(fname):
+    """Rewrite fname in canonical form to the scratch file and hash that."""
+    trailer = make_canonical(PdfReader(fname))
+    out = PdfWriter(tmp)
+    out.write(trailer=trailer)
+    return get_digest(tmp)
+
+
+# Pairs of (old digest, new digest), both as bytes, to swap in expected.txt.
+changes = []
+for (srcpath, _, filenames) in os.walk('ramdisk/reference'):
+    for name in filenames:
+        if not name.endswith('.pdf'):
+            continue
+        src = os.path.join(srcpath, name)
+        dst = src.replace('/reference/', '/tmp_results/')
+        if not os.path.exists(dst):
+            continue
+        src_digest = get_digest(src)
+        if not src_digest:
+            continue
+        # expected.txt was read in binary mode, so match against bytes.
+        # (A no-op on Python 2, required on Python 3.)
+        src_key = src_digest.encode('ascii')
+        if src_key not in expected:
+            continue
+        # Single-argument print(...) emits the same text on Python 2 and 3.
+        print(src)
+        count += 1
+        match_digest = _canonical_digest(src)
+        if not match_digest:
+            continue
+        if _canonical_digest(dst) != match_digest:
+            continue
+        dst_digest = get_digest(dst)
+        if not dst_digest:
+            # An empty result has no digest to substitute; skip it rather
+            # than fail in the replace step below.
+            continue
+        goodcount += 1
+        print("OK")
+        changes.append((src_key, dst_digest.encode('ascii')))
+
+print('%d %d' % (count, goodcount))
+
+for stuff in changes:
+    expected = expected.replace(*stuff)
+
+with open('expected.txt', 'wb') as f:
+    f.write(expected)
+
+# Do not leave the scratch PDF behind in the working directory.
+if os.path.exists(tmp):
+    os.remove(tmp)