Import pdfrw_0.4.orig.tar.gz
[dgit import orig pdfrw_0.4.orig.tar.gz]
This commit is contained in:
commit
5d56e870e8
|
@ -0,0 +1,67 @@
|
||||||
|
# OSX
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
Icon
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear on external disk
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
|
||||||
|
|
||||||
|
# Development artifacts
|
||||||
|
diffs.txt
|
||||||
|
examples/*.pdf
|
||||||
|
examples/rl*/*.pdf
|
||||||
|
tests/*.pdf
|
||||||
|
examples/pdfrw
|
||||||
|
examples/rl*/pdfrw
|
||||||
|
tests/pdfrw
|
||||||
|
tests/static_pdfs
|
||||||
|
tests/ramdisk
|
||||||
|
tests/saved_results
|
||||||
|
tests/tmp_results
|
||||||
|
wiki/
|
||||||
|
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
env/
|
||||||
|
bin/
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
lib64
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
pyvenv.cfg
|
||||||
|
pip-selfcheck.json
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
|
@ -0,0 +1,20 @@
|
||||||
|
language: python
|
||||||
|
python:
|
||||||
|
- "2.6"
|
||||||
|
- "2.7"
|
||||||
|
- "3.3"
|
||||||
|
- "3.4"
|
||||||
|
- "3.5"
|
||||||
|
- "3.6"
|
||||||
|
- "nightly"
|
||||||
|
# command to install dependencies
|
||||||
|
before_install:
|
||||||
|
- "git clone https://github.com/pmaupin/static_pdfs tests/static_pdfs"
|
||||||
|
install:
|
||||||
|
- "pip install ."
|
||||||
|
- "pip install reportlab || true"
|
||||||
|
- "pip install PyCrypto || true"
|
||||||
|
- "pip install zlib || true"
|
||||||
|
- "pip install unittest2 || true"
|
||||||
|
# command to run tests
|
||||||
|
script: "cd tests; /usr/bin/env PYTHONPATH=. py.test"
|
|
@ -0,0 +1,74 @@
|
||||||
|
pdfrw (github.com/pmaupin/pdfrw)
|
||||||
|
|
||||||
|
The majority of pdfrw was written by Patrick Maupin and is licensed
|
||||||
|
under the MIT license (reproduced below). Other contributors include
|
||||||
|
Attila Tajti and Nerijus Mika. It appears that some of the decompression
|
||||||
|
code was based on the decompressor from PyPDF2, which was written by
|
||||||
|
Mathieu Fenniak and licensed under the BSD license (also reproduced below).
|
||||||
|
|
||||||
|
Please add any missing authors here:
|
||||||
|
|
||||||
|
Copyright (c) 2006-2017 Patrick Maupin. All rights reserved.
|
||||||
|
Copyright (c) 2006 Mathieu Fenniak. All rights reserved.
|
||||||
|
Copyright (c) 2010 Attila Tajti. All rights reserved.
|
||||||
|
Copyright (c) 2012 Nerijus Mika. All rights reserved.
|
||||||
|
Copyright (c) 2015 Bastien Gandouet. All rights reserved.
|
||||||
|
Copyright (c) 2015 Tzerjen Wei. All rights reserved.
|
||||||
|
Copyright (c) 2015 Jorj X. McKie. All rights reserved.
|
||||||
|
Copyright (c) 2015 Nicholas Devenish. All rights reserved.
|
||||||
|
Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved.
|
||||||
|
Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved.
|
||||||
|
Copyright (c) 2016 James Laird-Wah. All rights reserved.
|
||||||
|
Copyright (c) 2016 Marcus Brinkmann. All rights reserved.
|
||||||
|
Copyright (c) 2016 Edward Betts. All rights reserved.
|
||||||
|
Copyright (c) 2016 Patrick Mazulo. All rights reserved.
|
||||||
|
Copyright (c) 2017 Haochen Wu. All rights reserved.
|
||||||
|
Copyright (c) 2017 Jon Lund Steffensen. All rights reserved.
|
||||||
|
|
||||||
|
|
||||||
|
MIT License:
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
|
|
||||||
|
|
||||||
|
BSD License:
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
* The name of the author may not be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,3 @@
|
||||||
|
include *.txt *.in *.rst
|
||||||
|
recursive-include examples *.txt *.py
|
||||||
|
recursive-include tests *.py
|
|
@ -0,0 +1,789 @@
|
||||||
|
==================
|
||||||
|
pdfrw 0.4
|
||||||
|
==================
|
||||||
|
|
||||||
|
:Author: Patrick Maupin
|
||||||
|
|
||||||
|
.. contents::
|
||||||
|
:backlinks: none
|
||||||
|
|
||||||
|
.. sectnum::
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
============
|
||||||
|
|
||||||
|
**pdfrw** is a Python library and utility that reads and writes PDF files:
|
||||||
|
|
||||||
|
* Version 0.4 is tested and works on Python 2.6, 2.7, 3.3, 3.4, 3.5, and 3.6
|
||||||
|
* Operations include subsetting, merging, rotating, modifying metadata, etc.
|
||||||
|
* The fastest pure Python PDF parser available
|
||||||
|
* Has been used for years by a printer in pre-press production
|
||||||
|
* Can be used with rst2pdf to faithfully reproduce vector images
|
||||||
|
* Can be used either standalone, or in conjunction with `reportlab`__
|
||||||
|
to reuse existing PDFs in new ones
|
||||||
|
* Permissively licensed
|
||||||
|
|
||||||
|
__ http://www.reportlab.org/
|
||||||
|
|
||||||
|
|
||||||
|
pdfrw will faithfully reproduce vector formats without
|
||||||
|
rasterization, so the rst2pdf package has used pdfrw
|
||||||
|
for PDF and SVG images by default since March 2010.
|
||||||
|
|
||||||
|
pdfrw can also be used in conjunction with reportlab, in order
|
||||||
|
to re-use portions of existing PDFs in new PDFs created with
|
||||||
|
reportlab.
|
||||||
|
|
||||||
|
|
||||||
|
Examples
|
||||||
|
=========
|
||||||
|
|
||||||
|
The library comes with several examples that show operation both with
|
||||||
|
and without reportlab.
|
||||||
|
|
||||||
|
|
||||||
|
All examples
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The examples directory has a few scripts which use the library.
|
||||||
|
Note that if these examples do not work with your PDF, you should
|
||||||
|
try to use pdftk to uncompress and/or unencrypt them first.
|
||||||
|
|
||||||
|
* `4up.py`__ will shrink pages down and place 4 of them on
|
||||||
|
each output page.
|
||||||
|
* `alter.py`__ shows an example of modifying metadata, without
|
||||||
|
altering the structure of the PDF.
|
||||||
|
* `booklet.py`__ shows an example of creating a 2-up output
|
||||||
|
suitable for printing and folding (e.g on tabloid size paper).
|
||||||
|
* `cat.py`__ shows an example of concatenating multiple PDFs together.
|
||||||
|
* `extract.py`__ will extract images and Form XObjects (embedded pages)
|
||||||
|
from existing PDFs to make them easier to use and refer to from
|
||||||
|
new PDFs (e.g. with reportlab or rst2pdf).
|
||||||
|
* `poster.py`__ increases the size of a PDF so it can be printed
|
||||||
|
as a poster.
|
||||||
|
* `print_two.py`__ Allows creation of 8.5 X 5.5" booklets by slicing
|
||||||
|
8.5 X 11" paper apart after printing.
|
||||||
|
* `rotate.py`__ Rotates all or selected pages in a PDF.
|
||||||
|
* `subset.py`__ Creates a new PDF with only a subset of pages from the
|
||||||
|
original.
|
||||||
|
* `unspread.py`__ Takes a 2-up PDF, and splits out pages.
|
||||||
|
* `watermark.py`__ Adds a watermark PDF image over or under all the pages
|
||||||
|
of a PDF.
|
||||||
|
* `rl1/4up.py`__ Another 4up example, using reportlab canvas for output.
|
||||||
|
* `rl1/booklet.py`__ Another booklet example, using reportlab canvas for
|
||||||
|
output.
|
||||||
|
* `rl1/subset.py`__ Another subsetting example, using reportlab canvas for
|
||||||
|
output.
|
||||||
|
* `rl1/platypus_pdf_template.py`__ Another watermarking example, using
|
||||||
|
reportlab canvas and generated output for the document. Contributed
|
||||||
|
by user asannes.
|
||||||
|
* `rl2`__ Experimental code for parsing graphics. Needs work.
|
||||||
|
* `subset_booklets.py`__ shows an example of creating a full printable pdf
|
||||||
|
version in a more professional and practical way (take a look at
|
||||||
|
http://www.wikihow.com/Bind-a-Book )
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/extract.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/poster.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rotate.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/unspread.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/watermark.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/4up.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset_booklets.py
|
||||||
|
|
||||||
|
Notes on selected examples
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
Reorganizing pages and placing them two-up
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A printer with a fancy printer and/or a full-up copy of Acrobat can
|
||||||
|
easily turn your small PDF into a little booklet (for example, print 4
|
||||||
|
letter-sized pages on a single 11" x 17").
|
||||||
|
|
||||||
|
But that assumes several things, including that the personnel know how
|
||||||
|
to operate the hardware and software. `booklet.py`__ lets you turn your PDF
|
||||||
|
into a preformatted booklet, to give them fewer chances to mess it up.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/booklet.py
|
||||||
|
|
||||||
|
Adding or modifying metadata
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `cat.py`__ example will accept multiple input files on the command
|
||||||
|
line, concatenate them and output them to output.pdf, after adding some
|
||||||
|
nonsensical metadata to the output PDF file.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/cat.py
|
||||||
|
|
||||||
|
The `alter.py`__ example alters a single metadata item in a PDF,
|
||||||
|
and writes the result to a new PDF.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
|
||||||
|
|
||||||
|
|
||||||
|
One difference is that, since **cat** is creating a new PDF structure,
|
||||||
|
and **alter** is attempting to modify an existing PDF structure, the
|
||||||
|
PDF produced by alter (and also by watermark.py) *should* be
|
||||||
|
more faithful to the original (except for the desired changes).
|
||||||
|
|
||||||
|
For example, the alter.py navigation should be left intact, whereas with
|
||||||
|
cat.py it will be stripped.
|
||||||
|
|
||||||
|
|
||||||
|
Rotating and doubling
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
If you ever want to print something that is like a small booklet, but
|
||||||
|
needs to be spiral bound, you either have to do some fancy rearranging,
|
||||||
|
or just waste half your paper.
|
||||||
|
|
||||||
|
The `print_two.py`__ example program will, for example, make two side-by-side
|
||||||
|
copies of each page of your PDF on each output sheet.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/print_two.py
|
||||||
|
|
||||||
|
But, every other page is flipped, so that you can print double-sided and
|
||||||
|
the pages will line up properly and be pre-collated.
|
||||||
|
|
||||||
|
Graphics stream parsing proof of concept
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `copy.py`__ script shows a simple example of reading in a PDF, and
|
||||||
|
using the decodegraphics.py module to try to write the same information
|
||||||
|
out to a new PDF through a reportlab canvas. (If you know about reportlab,
|
||||||
|
you know that if you can faithfully render a PDF to a reportlab canvas, you
|
||||||
|
can do pretty much anything else with that PDF you want.) This kind of
|
||||||
|
low level manipulation should be done only if you really need to.
|
||||||
|
decodegraphics is really more a proof of concept than anything
|
||||||
|
else. For most cases, just use the Form XObject capability, as shown in
|
||||||
|
the examples/rl1/booklet.py demo.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/copy.py
|
||||||
|
|
||||||
|
pdfrw philosophy
|
||||||
|
==================
|
||||||
|
|
||||||
|
Core library
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The philosophy of the library portion of pdfrw is to provide intuitive
|
||||||
|
functions to read, manipulate, and write PDF files. There should be
|
||||||
|
minimal leakage between abstraction layers, although getting useful
|
||||||
|
work done makes "pure" functionality separation difficult.
|
||||||
|
|
||||||
|
A key concept supported by the library is the use of Form XObjects,
|
||||||
|
which allow easy embedding of pieces of one PDF into another.
|
||||||
|
|
||||||
|
Addition of core support to the library is typically done carefully
|
||||||
|
and thoughtfully, so as not to clutter it up with too many special
|
||||||
|
cases.
|
||||||
|
|
||||||
|
There are a lot of incorrectly formatted PDFs floating around; support
|
||||||
|
for these is added in some cases. The decision is often based on what
|
||||||
|
acroread and okular do with the PDFs; if they can display them properly,
|
||||||
|
then eventually pdfrw should, too, if it is not too difficult or costly.
|
||||||
|
|
||||||
|
Contributions are welcome; one user has contributed some decompression
|
||||||
|
filters and the ability to process PDF 1.5 stream objects. Additional
|
||||||
|
functionality that would obviously be useful includes additional
|
||||||
|
decompression filters, the ability to process password-protected PDFs,
|
||||||
|
and the ability to output linearized PDFs.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
The philosophy of the examples is to provide small, easily-understood
|
||||||
|
examples that showcase pdfrw functionality.
|
||||||
|
|
||||||
|
|
||||||
|
PDF files and Python
|
||||||
|
======================
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
------------
|
||||||
|
|
||||||
|
In general, PDF files conceptually map quite well to Python. The major
|
||||||
|
objects to think about are:
|
||||||
|
|
||||||
|
- **strings**. Most things are strings. These also often decompose
|
||||||
|
naturally into
|
||||||
|
- **lists of tokens**. Tokens can be combined to create higher-level
|
||||||
|
objects like
|
||||||
|
- **arrays** and
|
||||||
|
- **dictionaries** and
|
||||||
|
- **Contents streams** (which can be more streams of tokens)
|
||||||
|
|
||||||
|
Difficulties
|
||||||
|
------------
|
||||||
|
|
||||||
|
The apparent primary difficulty in mapping PDF files to Python is the
|
||||||
|
PDF file concept of "indirect objects." Indirect objects provide
|
||||||
|
the efficiency of allowing a single piece of data to be referred to
|
||||||
|
from more than one containing object, but probably more importantly,
|
||||||
|
indirect objects provide a way to get around the chicken and egg
|
||||||
|
problem of circular object references when mapping arbitrary data
|
||||||
|
structures to files. To flatten out a circular reference, an indirect
|
||||||
|
object is *referred to* instead of being *directly included* in another
|
||||||
|
object. PDF files have a global mechanism for locating indirect objects,
|
||||||
|
and they all have two reference numbers (a reference number and a
|
||||||
|
"generation" number, in case you wanted to append to the PDF file
|
||||||
|
rather than just rewriting the whole thing).
|
||||||
|
|
||||||
|
pdfrw automatically handles indirect references on reading in a PDF
|
||||||
|
file. When pdfrw encounters an indirect PDF file object, the
|
||||||
|
corresponding Python object it creates will have an 'indirect' attribute
|
||||||
|
with a value of True. When writing a PDF file, if you have created
|
||||||
|
arbitrary data, you just need to make sure that circular references are
|
||||||
|
broken up by putting an attribute named 'indirect' which evaluates to
|
||||||
|
True on at least one object in every cycle.
|
||||||
|
|
||||||
|
Another PDF file concept that doesn't quite map to regular Python is a
|
||||||
|
"stream". Streams are dictionaries which each have an associated
|
||||||
|
unformatted data block. pdfrw handles streams by placing a special
|
||||||
|
attribute on a subclassed dictionary.
|
||||||
|
|
||||||
|
Usage Model
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The usage model for pdfrw treats most objects as strings (it takes their
|
||||||
|
string representation when writing them to a file). The two main
|
||||||
|
exceptions are the PdfArray object and the PdfDict object.
|
||||||
|
|
||||||
|
PdfArray is a subclass of list with two special features. First,
|
||||||
|
an 'indirect' attribute allows a PdfArray to be written out as
|
||||||
|
an indirect PDF object. Second, pdfrw reads files lazily, so
|
||||||
|
PdfArray knows about, and resolves references to other indirect
|
||||||
|
objects on an as-needed basis.
|
||||||
|
|
||||||
|
PdfDict is a subclass of dict that also has an indirect attribute
|
||||||
|
and lazy reference resolution as well. (And the subclassed
|
||||||
|
IndirectPdfDict has indirect automatically set True).
|
||||||
|
|
||||||
|
But PdfDict also has an optional associated stream. The stream object
|
||||||
|
defaults to None, but if you assign a stream to the dict, it will
|
||||||
|
automatically set the PDF /Length attribute for the dictionary.
|
||||||
|
|
||||||
|
Finally, since PdfDict instances are indexed by PdfName objects (which
|
||||||
|
always start with a /) and since most (all?) standard Adobe PdfName
|
||||||
|
objects use names formatted like "/CamelCase", it makes sense to allow
|
||||||
|
access to dictionary elements via object attribute accesses as well as
|
||||||
|
object index accesses. So usage of PdfDict objects is normally via
|
||||||
|
attribute access, although non-standard names (though still with a
|
||||||
|
leading slash) can be accessed via dictionary index lookup.
|
||||||
|
|
||||||
|
Reading PDFs
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The PdfReader object is a subclass of PdfDict, which allows easy access
|
||||||
|
to an entire document::
|
||||||
|
|
||||||
|
>>> from pdfrw import PdfReader
|
||||||
|
>>> x = PdfReader('source.pdf')
|
||||||
|
>>> x.keys()
|
||||||
|
['/Info', '/Size', '/Root']
|
||||||
|
>>> x.Info
|
||||||
|
{'/Producer': '(cairo 1.8.6 (http://cairographics.org))',
|
||||||
|
'/Creator': '(cairo 1.8.6 (http://cairographics.org))'}
|
||||||
|
>>> x.Root.keys()
|
||||||
|
['/Type', '/Pages']
|
||||||
|
|
||||||
|
Info, Size, and Root are retrieved from the trailer of the PDF file.
|
||||||
|
|
||||||
|
In addition to the tree structure, pdfrw creates a special attribute
|
||||||
|
named *pages*, that is a list of all the pages in the document. pdfrw
|
||||||
|
creates the *pages* attribute as a simplification for the user, because
|
||||||
|
the PDF format allows arbitrarily complicated nested dictionaries to
|
||||||
|
describe the page order. Each entry in the *pages* list is the PdfDict
|
||||||
|
object for one of the pages in the file, in order.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> len(x.pages)
|
||||||
|
1
|
||||||
|
>>> x.pages[0]
|
||||||
|
{'/Parent': {'/Kids': [{...}], '/Type': '/Pages', '/Count': '1'},
|
||||||
|
'/Contents': {'/Length': '11260', '/Filter': None},
|
||||||
|
'/Resources': ... (Lots more stuff snipped)
|
||||||
|
>>> x.pages[0].Contents
|
||||||
|
{'/Length': '11260', '/Filter': None}
|
||||||
|
>>> x.pages[0].Contents.stream
|
||||||
|
'q\n1 1 1 rg /a0 gs\n0 0 0 RG 0.657436
|
||||||
|
w\n0 J\n0 j\n[] 0.0 d\n4 M q' ... (Lots more stuff snipped)
|
||||||
|
|
||||||
|
Writing PDFs
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
As you can see, it is quite easy to dig down into a PDF document. But
|
||||||
|
what about when it's time to write it out?
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> from pdfrw import PdfWriter
|
||||||
|
>>> y = PdfWriter()
|
||||||
|
>>> y.addpage(x.pages[0])
|
||||||
|
>>> y.write('result.pdf')
|
||||||
|
|
||||||
|
That's all it takes to create a new PDF. You may still need to read the
|
||||||
|
`Adobe PDF reference manual`__ to figure out what needs to go *into*
|
||||||
|
the PDF, but at least you don't have to sweat actually building it
|
||||||
|
and getting the file offsets right.
|
||||||
|
|
||||||
|
__ http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
|
||||||
|
|
||||||
|
Manipulating PDFs in memory
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For the most part, pdfrw tries to be agnostic about the contents of
|
||||||
|
PDF files, and support them as containers, but to do useful work,
|
||||||
|
something a little higher-level is required, so pdfrw works to
|
||||||
|
understand a bit about the contents of the containers. For example:
|
||||||
|
|
||||||
|
- PDF pages. pdfrw knows enough to find the pages in PDF files you read
|
||||||
|
in, and to write a set of pages back out to a new PDF file.
|
||||||
|
- Form XObjects. pdfrw can take any page or rectangle on a page, and
|
||||||
|
convert it to a Form XObject, suitable for use inside another PDF
|
||||||
|
file. It knows enough about these to perform scaling, rotation,
|
||||||
|
and positioning.
|
||||||
|
- reportlab objects. pdfrw can recursively create a set of reportlab
|
||||||
|
objects from its internal object format. This allows, for example,
|
||||||
|
Form XObjects to be used inside reportlab, so that you can reuse
|
||||||
|
content from an existing PDF file when building a new PDF with
|
||||||
|
reportlab.
|
||||||
|
|
||||||
|
There are several examples that demonstrate these features in
|
||||||
|
the example code directory.
|
||||||
|
|
||||||
|
Missing features
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Even as a pure PDF container library, pdfrw comes up a bit short. It
|
||||||
|
does not currently support:
|
||||||
|
|
||||||
|
- Most compression/decompression filters
|
||||||
|
- encryption
|
||||||
|
|
||||||
|
`pdftk`__ is a wonderful command-line
|
||||||
|
tool that can convert your PDFs to remove encryption and compression.
|
||||||
|
However, in most cases, you can do a lot of useful work with PDFs
|
||||||
|
without actually removing compression, because only certain elements
|
||||||
|
inside PDFs are actually compressed.
|
||||||
|
|
||||||
|
__ https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
|
||||||
|
|
||||||
|
Library internals
|
||||||
|
==================
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
------------
|
||||||
|
|
||||||
|
**pdfrw** currently consists of 19 modules organized into a main
|
||||||
|
package and one sub-package.
|
||||||
|
|
||||||
|
The `__init__.py`__ module does the usual thing of importing a few
|
||||||
|
major attributes from some of the submodules, and the `errors.py`__
|
||||||
|
module supports logging and exception generation.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/__init__.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/errors.py
|
||||||
|
|
||||||
|
|
||||||
|
PDF object model support
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
The `objects`__ sub-package contains one module for each of the
|
||||||
|
internal representations of the kinds of basic objects that exist
|
||||||
|
in a PDF file, with the `objects/__init__.py`__ module in that
|
||||||
|
package simply gathering them up and making them available to the
|
||||||
|
main pdfrw package.
|
||||||
|
|
||||||
|
One feature that all the PDF object classes have in common is the
|
||||||
|
inclusion of an 'indirect' attribute. If 'indirect' exists and evaluates
|
||||||
|
to True, then when the object is written out, it is written out as an
|
||||||
|
indirect object. That is to say, it is addressable in the PDF file, and
|
||||||
|
could be referenced by any number (including zero) of container objects.
|
||||||
|
This indirect object capability saves space in PDF files by allowing
|
||||||
|
objects such as fonts to be referenced from multiple pages, and also
|
||||||
|
allows PDF files to contain internal circular references. This latter
|
||||||
|
capability is used, for example, when each page object has a "parent"
|
||||||
|
object in its dictionary.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/__init__.py
|
||||||
|
|
||||||
|
Ordinary objects
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfobject.py`__ module contains the PdfObject class, which is
|
||||||
|
a subclass of str, and is the catch-all object for any PDF file elements
|
||||||
|
that are not explicitly represented by other objects, as described below.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfobject.py
|
||||||
|
|
||||||
|
Name objects
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfname.py`__ module contains the PdfName singleton object,
|
||||||
|
which will convert a string into a PDF name by prepending a slash. It can
|
||||||
|
be used either by calling it or getting an attribute, e.g.::
|
||||||
|
|
||||||
|
PdfName.Rotate == PdfName('Rotate') == PdfObject('/Rotate')
|
||||||
|
|
||||||
|
In the example above, there is a slight difference between the objects
|
||||||
|
returned from PdfName, and the object returned from PdfObject. The
|
||||||
|
PdfName objects are actually objects of class "BasePdfName". This
|
||||||
|
is important, because only these may be used as keys in PdfDict objects.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfname.py
|
||||||
|
|
||||||
|
String objects
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfstring.py`__
|
||||||
|
module contains the PdfString class, which is a subclass of str that is
|
||||||
|
used to represent encoded strings in a PDF file. The class has encode
|
||||||
|
and decode methods for the strings.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfstring.py
|
||||||
|
|
||||||
|
|
||||||
|
Array objects
|
||||||
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfarray.py`__
|
||||||
|
module contains the PdfArray class, which is a subclass of list that is
|
||||||
|
used to represent arrays in a PDF file. A regular list could be used
|
||||||
|
instead, but use of the PdfArray class allows for an indirect attribute
|
||||||
|
to be set, and also allows for proxying of unresolved indirect objects
|
||||||
|
(that haven't been read in yet) in a manner that is transparent to pdfrw
|
||||||
|
clients.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfarray.py
|
||||||
|
|
||||||
|
Dict objects
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfdict.py`__
|
||||||
|
module contains the PdfDict class, which is a subclass of dict that is
|
||||||
|
used to represent dictionaries in a PDF file. A regular dict could be
|
||||||
|
used instead, but the PdfDict class matches the requirements of PDF
|
||||||
|
files more closely:
|
||||||
|
|
||||||
|
* Transparent (from the library client's viewpoint) proxying
|
||||||
|
of unresolved indirect objects
|
||||||
|
* Return of None for non-existent keys (like dict.get)
|
||||||
|
* Mapping of attribute accesses to the dict itself
|
||||||
|
(pdfdict.Foo == pdfdict[NameObject('Foo')])
|
||||||
|
* Automatic management of following stream and /Length attributes
|
||||||
|
for content dictionaries
|
||||||
|
* Indirect attribute
|
||||||
|
* Other attributes may be set for private internal use of the
|
||||||
|
library and/or its clients.
|
||||||
|
* Support for searching parent dictionaries for PDF "inheritable"
|
||||||
|
attributes.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfdict.py
|
||||||
|
|
||||||
|
If a PdfDict has an associated data stream in the PDF file, the stream
|
||||||
|
is accessed via the 'stream' (all lower-case) attribute. Setting the
|
||||||
|
stream attribute on the PdfDict will automatically set the /Length attribute
|
||||||
|
as well. If that is not what is desired (for example if the the stream
|
||||||
|
is compressed), then _stream (same name with an underscore) may be used
|
||||||
|
to associate the stream with the PdfDict without setting the length.
|
||||||
|
|
||||||
|
To set private attributes (that will not be written out to a new PDF
|
||||||
|
file) on a dictionary, use the 'private' attribute::
|
||||||
|
|
||||||
|
mydict.private.foo = 1
|
||||||
|
|
||||||
|
Once the attribute is set, it may be accessed directly as an attribute
|
||||||
|
of the dictionary::
|
||||||
|
|
||||||
|
foo = mydict.foo
|
||||||
|
|
||||||
|
Some attributes of PDF pages are "inheritable." That is, they may
|
||||||
|
belong to a parent dictionary (or a parent of a parent dictionary, etc.)
|
||||||
|
The "inheritable" attribute allows for easy discovery of these::
|
||||||
|
|
||||||
|
mediabox = mypage.inheritable.MediaBox
|
||||||
|
|
||||||
|
|
||||||
|
Proxy objects
|
||||||
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The `objects/pdfindirect.py`__
|
||||||
|
module contains the PdfIndirect class, which is a non-transparent proxy
|
||||||
|
object for PDF objects that have not yet been read in and resolved from
|
||||||
|
a file. Although these are non-transparent inside the library, client code
|
||||||
|
should never see one of these -- they exist inside the PdfArray and PdfDict
|
||||||
|
container types, but are resolved before being returned to a client of
|
||||||
|
those types.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/objects/pdfindirect.py
|
||||||
|
|
||||||
|
|
||||||
|
File reading, tokenization and parsing
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
`pdfreader.py`__
|
||||||
|
contains the PdfReader class, which can read a PDF file (or be passed a
|
||||||
|
file object or already read string) and parse it. It uses the PdfTokens
|
||||||
|
class in `tokens.py`__ for low-level tokenization.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfreader.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/tokens.py
|
||||||
|
|
||||||
|
|
||||||
|
The PdfReader class does not, in general, parse into containers (e.g.
|
||||||
|
inside the content streams). There is a proof of concept for doing that
|
||||||
|
inside the examples/rl2 subdirectory, but that is slow and not well-developed,
|
||||||
|
and not useful for most applications.
|
||||||
|
|
||||||
|
An instance of the PdfReader class is an instance of a PdfDict -- the
|
||||||
|
trailer dictionary of the PDF file, to be exact. It will have a private
|
||||||
|
attribute set on it that is named 'pages' that is a list containing all
|
||||||
|
the pages in the file.
|
||||||
|
|
||||||
|
When instantiating a PdfReader object, there are options available
|
||||||
|
for decompressing all the objects in the file. pdfrw does not currently
|
||||||
|
have very many options for decompression, so this is not all that useful,
|
||||||
|
except in the specific case of compressed object streams.
|
||||||
|
|
||||||
|
Also, there are no options for decryption yet. If you have PDF files
|
||||||
|
that are encrypted or heavily compressed, you may find that using another
|
||||||
|
program like pdftk on them can make them readable by pdfrw.
|
||||||
|
|
||||||
|
In general, the objects are read from the file lazily, but this is not
|
||||||
|
currently true with compressed object streams -- all of these are decompressed
|
||||||
|
and read in when the PdfReader is instantiated.
|
||||||
|
|
||||||
|
|
||||||
|
File output
|
||||||
|
-----------
|
||||||
|
|
||||||
|
`pdfwriter.py`__
|
||||||
|
contains the PdfWriter class, which can create and output a PDF file.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pdfwriter.py
|
||||||
|
|
||||||
|
There are a few options available when creating and using this class.
|
||||||
|
|
||||||
|
In the simplest case, an instance of PdfWriter is instantiated, and
|
||||||
|
then pages are added to it from one or more source files (or created
|
||||||
|
programmatically), and then the write method is called to dump the
|
||||||
|
results out to a file.
|
||||||
|
|
||||||
|
If you have a source PDF and do not want to disturb the structure
|
||||||
|
of it too badly, then you may pass its trailer directly to PdfWriter
|
||||||
|
rather than letting PdfWriter construct one for you. There is an
|
||||||
|
example of this (alter.py) in the examples directory.
|
||||||
|
|
||||||
|
|
||||||
|
Advanced features
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
`buildxobj.py`__
|
||||||
|
contains functions to build Form XObjects out of pages or rectangles on
|
||||||
|
pages. These may be reused in new PDFs essentially as if they were images.
|
||||||
|
|
||||||
|
buildxobj is careful to cache any page used so that it only appears in
|
||||||
|
the output once.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/buildxobj.py
|
||||||
|
|
||||||
|
|
||||||
|
`toreportlab.py`__
|
||||||
|
provides the makerl function, which will translate pdfrw objects into a
|
||||||
|
format which can be used with `reportlab <http://www.reportlab.org/>`__.
|
||||||
|
It is normally used in conjunction with buildxobj, to be able to reuse
|
||||||
|
parts of existing PDFs when using reportlab.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/toreportlab.py
|
||||||
|
|
||||||
|
|
||||||
|
`pagemerge.py`__ builds on the foundation laid by buildxobj. It
|
||||||
|
contains classes to create a new page (or overlay an existing page)
|
||||||
|
using one or more rectangles from other pages. There are examples
|
||||||
|
showing its use for watermarking, scaling, 4-up output, splitting
|
||||||
|
each page in 2, etc.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/pagemerge.py
|
||||||
|
|
||||||
|
`findobjs.py`__ contains code that can find specific kinds of objects
|
||||||
|
inside a PDF file. The extract.py example uses this module to create
|
||||||
|
a new PDF that places each image and Form XObject from a source PDF onto
|
||||||
|
its own page, e.g. for easy reuse with some of the other examples or
|
||||||
|
with reportlab.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/findobjs.py
|
||||||
|
|
||||||
|
|
||||||
|
Miscellaneous
|
||||||
|
----------------
|
||||||
|
|
||||||
|
`compress.py`__ and `uncompress.py`__
|
||||||
|
contains compression and decompression functions. Very few filters are
|
||||||
|
currently supported, so an external tool like pdftk might be good if you
|
||||||
|
require the ability to decompress (or, for that matter, decrypt) PDF
|
||||||
|
files.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/compress.py
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/uncompress.py
|
||||||
|
|
||||||
|
|
||||||
|
`py23_diffs.py`__ contains code to help manage the differences between
|
||||||
|
Python 2 and Python 3.
|
||||||
|
|
||||||
|
__ https://github.com/pmaupin/pdfrw/tree/master/pdfrw/py23_diffs.py
|
||||||
|
|
||||||
|
Testing
|
||||||
|
===============
|
||||||
|
|
||||||
|
The tests associated with pdfrw require a large number of PDFs,
|
||||||
|
which are not distributed with the library.
|
||||||
|
|
||||||
|
To run the tests:
|
||||||
|
|
||||||
|
* Download or clone the full package from github.com/pmaupin/pdfrw
|
||||||
|
* cd into the tests directory, and then clone the package
|
||||||
|
github.com/pmaupin/static_pdfs into a subdirectory (also named
|
||||||
|
static_pdfs).
|
||||||
|
* Now the tests may be run from that directory using unittest, or
|
||||||
|
py.test, or nose.
|
||||||
|
* travisci is used at github, and runs the tests with py.test
|
||||||
|
|
||||||
|
Other libraries
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Pure Python
|
||||||
|
-----------
|
||||||
|
|
||||||
|
- `reportlab <http://www.reportlab.org/>`__
|
||||||
|
|
||||||
|
reportlab is must-have software if you want to programmatically
|
||||||
|
generate arbitrary PDFs.
|
||||||
|
|
||||||
|
- `pyPdf <https://github.com/mstamy2/PyPDF2>`__
|
||||||
|
|
||||||
|
pyPdf is, in some ways, very full-featured. It can do decompression
|
||||||
|
and decryption and seems to know a lot about items inside at least
|
||||||
|
some kinds of PDF files. In comparison, pdfrw knows less about
|
||||||
|
specific PDF file features (such as metadata), but focuses on trying
|
||||||
|
to have a more Pythonic API for mapping the PDF file container
|
||||||
|
syntax to Python, and (IMO) has a simpler and better PDF file
|
||||||
|
parser. The Form XObject capability of pdfrw means that, in many
|
||||||
|
cases, it does not actually need to decompress objects -- they
|
||||||
|
can be left compressed.
|
||||||
|
|
||||||
|
- `pdftools <http://www.boddie.org.uk/david/Projects/Python/pdftools/index.html>`__
|
||||||
|
|
||||||
|
pdftools feels large and I fell asleep trying to figure out how it
|
||||||
|
all fit together, but many others have done useful things with it.
|
||||||
|
|
||||||
|
- `pagecatcher <http://www.reportlab.com/docs/pagecatcher-ds.pdf>`__
|
||||||
|
|
||||||
|
My understanding is that pagecatcher would have done exactly what I
|
||||||
|
wanted when I built pdfrw. But I was on a zero budget, so I've never
|
||||||
|
had the pleasure of experiencing pagecatcher. I do, however, use and
|
||||||
|
like `reportlab <http://www.reportlab.org/>`__ (open source, from
|
||||||
|
the people who make pagecatcher) so I'm sure pagecatcher is great,
|
||||||
|
better documented and much more full-featured than pdfrw.
|
||||||
|
|
||||||
|
- `pdfminer <http://www.unixuser.org/~euske/python/pdfminer/index.html>`__
|
||||||
|
|
||||||
|
This looks like a useful, actively-developed program. It is quite
|
||||||
|
large, but then, it is trying to actively comprehend a full PDF
|
||||||
|
document. From the website:
|
||||||
|
|
||||||
|
"PDFMiner is a suite of programs that help extracting and analyzing
|
||||||
|
text data of PDF documents. Unlike other PDF-related tools, it
|
||||||
|
allows to obtain the exact location of texts in a page, as well as
|
||||||
|
other extra information such as font information or ruled lines. It
|
||||||
|
includes a PDF converter that can transform PDF files into other
|
||||||
|
text formats (such as HTML). It has an extensible PDF parser that
|
||||||
|
can be used for other purposes instead of text analysis."
|
||||||
|
|
||||||
|
non-pure-Python libraries
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
- `pyPoppler <https://launchpad.net/poppler-python/>`__ can read PDF
|
||||||
|
files.
|
||||||
|
- `pycairo <http://www.cairographics.org/pycairo/>`__ can write PDF
|
||||||
|
files.
|
||||||
|
- `PyMuPDF <https://github.com/rk700/PyMuPDF>`_ high performance rendering
|
||||||
|
of PDF, (Open)XPS, CBZ and EPUB
|
||||||
|
|
||||||
|
Other tools
|
||||||
|
-----------
|
||||||
|
|
||||||
|
- `pdftk <https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/>`__ is a wonderful command
|
||||||
|
line tool for basic PDF manipulation. It complements pdfrw extremely
|
||||||
|
well, supporting many operations such as decryption and decompression
|
||||||
|
that pdfrw cannot do.
|
||||||
|
- `MuPDF <http://www.mupdf.com/>`_ is a free top performance PDF, (Open)XPS, CBZ and EPUB rendering library
|
||||||
|
that also comes with some command line tools. One of those, ``mutool``, has big overlaps with pdftk's -
|
||||||
|
except it is up to 10 times faster.
|
||||||
|
|
||||||
|
Release information
|
||||||
|
=======================
|
||||||
|
|
||||||
|
Revisions:
|
||||||
|
|
||||||
|
0.4 -- Released 18 September, 2017
|
||||||
|
|
||||||
|
- Python 3.6 added to test matrix
|
||||||
|
- Proper unicode support for text strings in PDFs added
|
||||||
|
- buildxobj fixes allow better support creating form XObjects
|
||||||
|
out of compressed pages in some cases
|
||||||
|
- Compression fixes for Python 3+
|
||||||
|
- New subset_booklets.py example
|
||||||
|
- Bug with non-compressed indices into compressed object streams fixed
|
||||||
|
- Bug with distinguishing compressed object stream first objects fixed
|
||||||
|
- Better error reporting added for some invalid PDFs (e.g. when reading
|
||||||
|
past the end of file)
|
||||||
|
- Better scrubbing of old bookmark information when writing PDFs, to
|
||||||
|
remove dangling references
|
||||||
|
- Refactoring of pdfwriter, including updating API, to allow future
|
||||||
|
enhancements for things like incremental writing
|
||||||
|
- Minor tokenizer speedup
|
||||||
|
- Some flate decompressor bugs fixed
|
||||||
|
- Compression and decompression tests added
|
||||||
|
- Tests for new unicode handling added
|
||||||
|
- PdfReader.readpages() recursion error (issue #92) fixed.
|
||||||
|
- Initial crypt filter support added
|
||||||
|
|
||||||
|
|
||||||
|
0.3 -- Released 19 October, 2016.
|
||||||
|
|
||||||
|
- Python 3.5 added to test matrix
|
||||||
|
- Better support under Python 3.x for in-memory PDF file-like objects
|
||||||
|
- Some pagemerge and Unicode patches added
|
||||||
|
- Changes to logging allow better coexistence with other packages
|
||||||
|
- Fix for "from pdfrw import \*"
|
||||||
|
- New fancy_watermark.py example shows off capabilities of pagemerge.py
|
||||||
|
- metadata.py example renamed to cat.py
|
||||||
|
|
||||||
|
|
||||||
|
0.2 -- Released 21 June, 2015. Supports Python 2.6, 2.7, 3.3, and 3.4.
|
||||||
|
|
||||||
|
- Several bugs have been fixed
|
||||||
|
- New regression test functionally tests core with dozens of
|
||||||
|
PDFs, and also tests examples.
|
||||||
|
- Core has been ported and tested on Python3 by round-tripping
|
||||||
|
several difficult files and observing binary matching results
|
||||||
|
across the different Python versions.
|
||||||
|
- Still only minimal support for compression and no support
|
||||||
|
for encryption or newer PDF features. (pdftk is useful
|
||||||
|
to put PDFs in a form that pdfrw can use.)
|
||||||
|
|
||||||
|
0.1 -- Released to PyPI in 2012. Supports Python 2.5 - 2.7
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: 4up.py my.pdf
|
||||||
|
|
||||||
|
Creates 4up.my.pdf with a single output page for every
|
||||||
|
4 input pages.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
|
||||||
|
def get4(srcpages):
|
||||||
|
scale = 0.5
|
||||||
|
srcpages = PageMerge() + srcpages
|
||||||
|
x_increment, y_increment = (scale * i for i in srcpages.xobj_box[2:])
|
||||||
|
for i, page in enumerate(srcpages):
|
||||||
|
page.scale(scale)
|
||||||
|
page.x = x_increment if i & 1 else 0
|
||||||
|
page.y = 0 if i & 2 else y_increment
|
||||||
|
return srcpages.render()
|
||||||
|
|
||||||
|
|
||||||
|
inpfn, = sys.argv[1:]
|
||||||
|
outfn = '4up.' + os.path.basename(inpfn)
|
||||||
|
pages = PdfReader(inpfn).pages
|
||||||
|
writer = PdfWriter(outfn)
|
||||||
|
for index in range(0, len(pages), 4):
|
||||||
|
writer.addpage(get4(pages[index:index + 4]))
|
||||||
|
writer.write()
|
|
@ -0,0 +1,32 @@
|
||||||
|
Example programs:
|
||||||
|
|
||||||
|
4up.py -- Prints pages four-up
|
||||||
|
|
||||||
|
alter.py -- Simple example of making a very slight modification to a PDF.
|
||||||
|
|
||||||
|
booklet.py -- Converts a PDF into a booklet.
|
||||||
|
|
||||||
|
cat.py -- Concatenates multiple PDFs, adds metadata.
|
||||||
|
|
||||||
|
poster.py -- Changes the size of a PDF to create a poster
|
||||||
|
|
||||||
|
print_two.py -- this is used when printing two cut-down copies on a single sheet of paper (double-sided) Requires uncompressed PDF.
|
||||||
|
|
||||||
|
rotate.py -- This will rotate selected ranges of pages within a document.
|
||||||
|
|
||||||
|
subset.py -- This will retrieve a subset of pages from a document.
|
||||||
|
|
||||||
|
watermark.py -- Adds a watermark to a PDF
|
||||||
|
|
||||||
|
rl1/4up.py -- Same as 4up.py, using reportlab for output. Next simplest reportlab example.
|
||||||
|
|
||||||
|
rl1/booklet.py -- Version of print_booklet using reportlab for output.
|
||||||
|
|
||||||
|
rl1/platypus_pdf_template.py -- Example using a PDF page as a watermark background with reportlab.
|
||||||
|
|
||||||
|
rl1/subset.py -- Same as subset.py, using reportlab for output. Simplest reportlab example.
|
||||||
|
|
||||||
|
rl2/copy.py -- example of how you could parse a graphics stream and then use reportlab for output.
|
||||||
|
Works on a few different PDFs, probably not a suitable starting point for real
|
||||||
|
production work without a lot of work on the library functions.
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: alter.py my.pdf
|
||||||
|
|
||||||
|
Creates alter.my.pdf
|
||||||
|
|
||||||
|
Demonstrates making a slight alteration to a preexisting PDF file.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
inpfn, = sys.argv[1:]
|
||||||
|
outfn = 'alter.' + os.path.basename(inpfn)
|
||||||
|
|
||||||
|
trailer = PdfReader(inpfn)
|
||||||
|
trailer.Info.Title = 'My New Title Goes Here'
|
||||||
|
PdfWriter(outfn, trailer=trailer).write()
|
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: booklet.py [-p] my.pdf
|
||||||
|
|
||||||
|
Creates booklet.my.pdf
|
||||||
|
|
||||||
|
Pages organized in a form suitable for booklet printing, e.g.
|
||||||
|
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
|
||||||
|
|
||||||
|
The output would be using the same type of sheet
|
||||||
|
and you can get up to 3 blank sides if -p is enabled.
|
||||||
|
|
||||||
|
Otherwise the two sides in the middle will be in original page size
|
||||||
|
and you can have 1 blank sides at most.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
|
||||||
|
def fixpage(*pages):
|
||||||
|
result = PageMerge() + (x for x in pages if x is not None)
|
||||||
|
result[-1].x += result[0].w
|
||||||
|
return result.render()
|
||||||
|
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("input", help="Input pdf file name")
|
||||||
|
parser.add_argument("-p", "--padding", action = "store_true",
|
||||||
|
help="Padding the document so that all pages use the same type of sheet")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
inpfn = args.input
|
||||||
|
outfn = 'booklet.' + os.path.basename(inpfn)
|
||||||
|
ipages = PdfReader(inpfn).pages
|
||||||
|
|
||||||
|
if args.padding:
|
||||||
|
pad_to = 4
|
||||||
|
else:
|
||||||
|
pad_to = 2
|
||||||
|
|
||||||
|
# Make sure we have a correct number of sides
|
||||||
|
ipages += [None]*(-len(ipages)%pad_to)
|
||||||
|
|
||||||
|
opages = []
|
||||||
|
while len(ipages) > 2:
|
||||||
|
opages.append(fixpage(ipages.pop(), ipages.pop(0)))
|
||||||
|
opages.append(fixpage(ipages.pop(0), ipages.pop()))
|
||||||
|
|
||||||
|
opages += ipages
|
||||||
|
|
||||||
|
PdfWriter(outfn).addpages(opages).write()
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: cat.py <first.pdf> [<next.pdf> ...]
|
||||||
|
|
||||||
|
Creates cat.<first.pdf>
|
||||||
|
|
||||||
|
This file demonstrates two features:
|
||||||
|
|
||||||
|
1) Concatenating multiple input PDFs.
|
||||||
|
|
||||||
|
2) adding metadata to the PDF.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, IndirectPdfDict
|
||||||
|
|
||||||
|
inputs = sys.argv[1:]
|
||||||
|
assert inputs
|
||||||
|
outfn = 'cat.' + os.path.basename(inputs[0])
|
||||||
|
|
||||||
|
writer = PdfWriter()
|
||||||
|
for inpfn in inputs:
|
||||||
|
writer.addpages(PdfReader(inpfn).pages)
|
||||||
|
|
||||||
|
writer.trailer.Info = IndirectPdfDict(
|
||||||
|
Title='your title goes here',
|
||||||
|
Author='your name goes here',
|
||||||
|
Subject='what is it all about?',
|
||||||
|
Creator='some script goes here',
|
||||||
|
)
|
||||||
|
writer.write(outfn)
|
|
@ -0,0 +1,27 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: extract.py <some.pdf>
|
||||||
|
|
||||||
|
Locates Form XObjects and Image XObjects within the PDF,
|
||||||
|
and creates a new PDF containing these -- one per page.
|
||||||
|
|
||||||
|
Resulting file will be named extract.<some.pdf>
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter
|
||||||
|
from pdfrw.findobjs import page_per_xobj
|
||||||
|
|
||||||
|
|
||||||
|
inpfn, = sys.argv[1:]
|
||||||
|
outfn = 'extract.' + os.path.basename(inpfn)
|
||||||
|
pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72))
|
||||||
|
if not pages:
|
||||||
|
raise IndexError("No XObjects found")
|
||||||
|
writer = PdfWriter(outfn)
|
||||||
|
writer.addpages(pages)
|
||||||
|
writer.write()
|
|
@ -0,0 +1,105 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
Enhanced example of watermarking using form xobjects (pdfrw).
|
||||||
|
|
||||||
|
usage: fancy_watermark.py [-u] my.pdf single_page.pdf
|
||||||
|
|
||||||
|
Creates watermark.my.pdf, with every page overlaid with
|
||||||
|
first page from single_page.pdf. If -u is selected, watermark
|
||||||
|
will be placed underneath page (painted first).
|
||||||
|
|
||||||
|
The stock watermark.py program assumes all pages are the same
|
||||||
|
size. This example deals with pages of differing sizes in order
|
||||||
|
to show some concepts of positioning and scaling.
|
||||||
|
|
||||||
|
This version applies the watermark such that the upper right
|
||||||
|
corner of the watermark is at the upper right corner of the
|
||||||
|
document page for odd pages, and at the upper left corner
|
||||||
|
of the document page for even pages, for each page of the
|
||||||
|
document.
|
||||||
|
|
||||||
|
It also rescales the size of the watermark if the watermark
|
||||||
|
is too wide for the page.
|
||||||
|
|
||||||
|
These scaling and positioning adjustments can easily
|
||||||
|
be customized for any particular application.
|
||||||
|
|
||||||
|
To handle documents with different page sizes, a cache is
|
||||||
|
maintained of a modified intermediate watermark object
|
||||||
|
for each page size.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
# Get all the filenames
|
||||||
|
|
||||||
|
argv = sys.argv[1:]
|
||||||
|
underneath = '-u' in argv
|
||||||
|
if underneath:
|
||||||
|
del argv[argv.index('-u')]
|
||||||
|
inpfn, wmarkfn = argv
|
||||||
|
outfn = 'watermark.' + os.path.basename(inpfn)
|
||||||
|
|
||||||
|
# Open both the source files
|
||||||
|
wmark_trailer = PdfReader(wmarkfn)
|
||||||
|
trailer = PdfReader(inpfn)
|
||||||
|
|
||||||
|
# Handle different sized pages in same document with
|
||||||
|
# a memoization cache, so we don't create more watermark
|
||||||
|
# objects than we need to (typically only one per document).
|
||||||
|
|
||||||
|
wmark_page = wmark_trailer.pages[0]
|
||||||
|
wmark_cache = {}
|
||||||
|
|
||||||
|
# Process every page
|
||||||
|
for pagenum, page in enumerate(trailer.pages, 1):
|
||||||
|
|
||||||
|
# Get the media box of the page, and see
|
||||||
|
# if we have a matching watermark in the cache
|
||||||
|
mbox = tuple(float(x) for x in page.MediaBox)
|
||||||
|
odd = pagenum & 1
|
||||||
|
key = mbox, odd
|
||||||
|
wmark = wmark_cache.get(key)
|
||||||
|
if wmark is None:
|
||||||
|
|
||||||
|
# Create and cache a new watermark object.
|
||||||
|
wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]
|
||||||
|
|
||||||
|
# The math is more complete than it probably needs to be,
|
||||||
|
# because the origin of all pages is almost always (0, 0).
|
||||||
|
# Nonetheless, we illustrate all the values and their names.
|
||||||
|
|
||||||
|
page_x, page_y, page_x1, page_y1 = mbox
|
||||||
|
page_w = page_x1 - page_x
|
||||||
|
page_h = page_y1 - page_y # For illustration, not used
|
||||||
|
|
||||||
|
# Scale the watermark if it is too wide for the page
|
||||||
|
# (Could do the same for height instead if needed)
|
||||||
|
if wmark.w > page_w:
|
||||||
|
wmark.scale(1.0 * page_w / wmark.w)
|
||||||
|
|
||||||
|
# Always put watermark at the top of the page
|
||||||
|
# (but see horizontal positioning for other ideas)
|
||||||
|
wmark.y += page_y1 - wmark.h
|
||||||
|
|
||||||
|
# For odd pages, put it at the left of the page,
|
||||||
|
# and for even pages, put it on the right of the page.
|
||||||
|
if odd:
|
||||||
|
wmark.x = page_x
|
||||||
|
else:
|
||||||
|
wmark.x += page_x1 - wmark.w
|
||||||
|
|
||||||
|
# Optimize the case where the watermark is same width
|
||||||
|
# as page.
|
||||||
|
if page_w == wmark.w:
|
||||||
|
wmark_cache[mbox, not odd] = wmark
|
||||||
|
|
||||||
|
# Add the watermark to the page
|
||||||
|
PageMerge(page).add(wmark, prepend=underneath).render()
|
||||||
|
|
||||||
|
# Write out the destination file
|
||||||
|
PdfWriter(outfn, trailer=trailer).write()
|
|
@ -0,0 +1,43 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: poster.py my.pdf
|
||||||
|
|
||||||
|
Shows how to change the size on a PDF.
|
||||||
|
|
||||||
|
Motivation:
|
||||||
|
|
||||||
|
My daughter needed to create a 48" x 36" poster, but her Mac
|
||||||
|
version of Powerpoint only wanted to output 8.5" x 11" for
|
||||||
|
some reason.
|
||||||
|
|
||||||
|
So she did an 8.5x11" output with 0.5" margin all around
|
||||||
|
(actual size of useful area 7.5x10") and we scaled it
|
||||||
|
up by 4.8.
|
||||||
|
|
||||||
|
We also copy the Info dict to the new PDF.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict
|
||||||
|
|
||||||
|
|
||||||
|
def adjust(page, margin=36, scale=4.8):
|
||||||
|
info = PageMerge().add(page)
|
||||||
|
x1, y1, x2, y2 = info.xobj_box
|
||||||
|
viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
|
||||||
|
page = PageMerge().add(page, viewrect=viewrect)
|
||||||
|
page[0].scale(scale)
|
||||||
|
return page.render()
|
||||||
|
|
||||||
|
|
||||||
|
inpfn, = sys.argv[1:]
|
||||||
|
outfn = 'poster.' + os.path.basename(inpfn)
|
||||||
|
reader = PdfReader(inpfn)
|
||||||
|
writer = PdfWriter(outfn)
|
||||||
|
writer.addpage(adjust(reader.pages[0]))
|
||||||
|
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
|
||||||
|
writer.write()
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: print_two.py my.pdf
|
||||||
|
|
||||||
|
Creates print_two.my.pdf
|
||||||
|
|
||||||
|
This is only useful when you can cut down sheets of paper to make two
|
||||||
|
small documents. Works for double-sided only right now.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
|
||||||
|
def fixpage(page, count=[0]):
|
||||||
|
count[0] += 1
|
||||||
|
oddpage = (count[0] & 1)
|
||||||
|
|
||||||
|
result = PageMerge()
|
||||||
|
for rotation in (180 + 180 * oddpage, 180 * oddpage):
|
||||||
|
result.add(page, rotate=rotation)
|
||||||
|
result[1].x = result[0].w
|
||||||
|
return result.render()
|
||||||
|
|
||||||
|
|
||||||
|
inpfn, = sys.argv[1:]
|
||||||
|
outfn = 'print_two.' + os.path.basename(inpfn)
|
||||||
|
pages = PdfReader(inpfn).pages
|
||||||
|
PdfWriter(outfn).addpages(fixpage(x) for x in pages).write()
|
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: 4up.py my.pdf
|
||||||
|
|
||||||
|
|
||||||
|
Uses Form XObjects and reportlab to create 4up.my.pdf.
|
||||||
|
|
||||||
|
Demonstrates use of pdfrw with reportlab.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from reportlab.pdfgen.canvas import Canvas
|
||||||
|
|
||||||
|
from pdfrw import PdfReader
|
||||||
|
from pdfrw.buildxobj import pagexobj
|
||||||
|
from pdfrw.toreportlab import makerl
|
||||||
|
|
||||||
|
|
||||||
|
def addpage(canvas, allpages):
|
||||||
|
pages = allpages[:4]
|
||||||
|
del allpages[:4]
|
||||||
|
|
||||||
|
x_max = max(page.BBox[2] for page in pages)
|
||||||
|
y_max = max(page.BBox[3] for page in pages)
|
||||||
|
|
||||||
|
canvas.setPageSize((x_max, y_max))
|
||||||
|
|
||||||
|
for index, page in enumerate(pages):
|
||||||
|
x = x_max * (index & 1) / 2.0
|
||||||
|
y = y_max * (index <= 1) / 2.0
|
||||||
|
canvas.saveState()
|
||||||
|
canvas.translate(x, y)
|
||||||
|
canvas.scale(0.5, 0.5)
|
||||||
|
canvas.doForm(makerl(canvas, page))
|
||||||
|
canvas.restoreState()
|
||||||
|
canvas.showPage()
|
||||||
|
|
||||||
|
|
||||||
|
def go(argv):
|
||||||
|
inpfn, = argv
|
||||||
|
outfn = '4up.' + os.path.basename(inpfn)
|
||||||
|
|
||||||
|
pages = PdfReader(inpfn).pages
|
||||||
|
pages = [pagexobj(x) for x in pages]
|
||||||
|
canvas = Canvas(outfn)
|
||||||
|
|
||||||
|
while pages:
|
||||||
|
addpage(canvas, pages)
|
||||||
|
canvas.save()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
go(sys.argv[1:])
|
|
@ -0,0 +1,9 @@
|
||||||
|
This directory contains example scripts which read in PDFs
|
||||||
|
and convert pages to PDF Form XObjects using pdfrw, and then
|
||||||
|
write out the PDFs using reportlab.
|
||||||
|
|
||||||
|
The examples, from easiest to hardest, are:
|
||||||
|
|
||||||
|
subset.py -- prints a subset of pages
|
||||||
|
4up.py -- prints pages 4-up
|
||||||
|
booklet.py -- creates a booklet out of the pages
|
|
@ -0,0 +1,68 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: booklet.py my.pdf
|
||||||
|
|
||||||
|
|
||||||
|
Uses Form XObjects and reportlab to create booklet.my.pdf.
|
||||||
|
|
||||||
|
Demonstrates use of pdfrw with reportlab.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from reportlab.pdfgen.canvas import Canvas
|
||||||
|
|
||||||
|
from pdfrw import PdfReader
|
||||||
|
from pdfrw.buildxobj import pagexobj
|
||||||
|
from pdfrw.toreportlab import makerl
|
||||||
|
|
||||||
|
|
||||||
|
def read_and_double(inpfn):
    """Read *inpfn* and pair its pages in booklet (saddle-stitch) order.

    Returns a list of 1- or 2-tuples of page Form XObjects:
    (last, first), (second, second-to-last), and so on working inward.
    """
    remaining = [pagexobj(page) for page in PdfReader(inpfn).pages]
    if len(remaining) % 2:
        # Sentinel -- get same size for back as front
        remaining.append(remaining[0])

    paired = []
    while len(remaining) > 2:
        paired.append((remaining.pop(), remaining.pop(0)))
        paired.append((remaining.pop(0), remaining.pop()))
    paired.extend((leftover,) for leftover in remaining)
    return paired
|
||||||
|
|
||||||
|
|
||||||
|
def make_pdf(outfn, xobjpairs):
    """Write *outfn*, one output sheet per entry of *xobjpairs*.

    Each entry is a 1- or 2-tuple of page xobjects (as produced by
    read_and_double); pairs are drawn side by side on a sheet wide
    enough to hold both.
    """
    canvas = Canvas(outfn)
    for xobjlist in xobjpairs:
        # Sheet size: combined width of the pair, tallest height.
        x = y = 0
        for xobj in xobjlist:
            x += xobj.BBox[2]
            y = max(y, xobj.BBox[3])
        canvas.setPageSize((x, y))

        # Handle blank back page
        # (the odd-count sentinel repeats the first page; draw it once,
        # offset by its own width so it lands in the right-hand half)
        if len(xobjlist) > 1 and xobjlist[0] == xobjlist[-1]:
            xobjlist = xobjlist[:1]
            x = xobjlist[0].BBox[2]
        else:
            x = 0
        y = 0

        # Paint each xobject at its running horizontal offset.
        for xobj in xobjlist:
            canvas.saveState()
            canvas.translate(x, y)
            canvas.doForm(makerl(canvas, xobj))
            canvas.restoreState()
            x += xobj.BBox[2]
        canvas.showPage()
    canvas.save()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry: booklet.py my.pdf -> booklet.my.pdf
inpfn, = sys.argv[1:]
outfn = 'booklet.' + os.path.basename(inpfn)

make_pdf(outfn, read_and_double(inpfn))
|
|
@ -0,0 +1,108 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
usage: platypus_pdf_template.py source.pdf
|
||||||
|
|
||||||
|
Creates platypus.source.pdf
|
||||||
|
|
||||||
|
Example of using pdfrw to use page 1 of a source PDF as the background
|
||||||
|
for other pages programmatically generated with Platypus.
|
||||||
|
|
||||||
|
Contributed by user asannes
|
||||||
|
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from reportlab.platypus import PageTemplate, BaseDocTemplate, Frame
|
||||||
|
from reportlab.platypus import NextPageTemplate, Paragraph, PageBreak
|
||||||
|
from reportlab.platypus.tableofcontents import TableOfContents
|
||||||
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||||
|
from reportlab.rl_config import defaultPageSize
|
||||||
|
from reportlab.lib.units import inch
|
||||||
|
from reportlab.graphics import renderPDF
|
||||||
|
|
||||||
|
from pdfrw import PdfReader
|
||||||
|
from pdfrw.buildxobj import pagexobj
|
||||||
|
from pdfrw.toreportlab import makerl
|
||||||
|
|
||||||
|
PAGE_WIDTH = defaultPageSize[0]
|
||||||
|
PAGE_HEIGHT = defaultPageSize[1]
|
||||||
|
|
||||||
|
|
||||||
|
class MyTemplate(PageTemplate):
    """The kernel of this example, where we use pdfrw to fill in the
    background of a page before writing to it. This could be used to
    fill in a watermark or similar."""

    def __init__(self, pdf_template_filename, name=None):
        """Build a page template whose background is page 1 of
        *pdf_template_filename*.
        """
        # Single content frame, inset from the page edges.
        frames = [Frame(
            0.85 * inch,
            0.5 * inch,
            PAGE_WIDTH - 1.15 * inch,
            PAGE_HEIGHT - (1.5 * inch)
            )]
        PageTemplate.__init__(self, name, frames)
        # use first page as template
        page = PdfReader(pdf_template_filename).pages[0]
        self.page_template = pagexobj(page)
        # Scale it to fill the complete page
        self.page_xscale = PAGE_WIDTH/self.page_template.BBox[2]
        self.page_yscale = PAGE_HEIGHT/self.page_template.BBox[3]

    def beforeDrawPage(self, canvas, doc):
        """Draws the background before anything else"""
        canvas.saveState()
        rl_obj = makerl(canvas, self.page_template)
        # Stretch the template xobject to cover the whole page.
        canvas.scale(self.page_xscale, self.page_yscale)
        canvas.doForm(rl_obj)
        canvas.restoreState()
|
||||||
|
|
||||||
|
|
||||||
|
class MyDocTemplate(BaseDocTemplate):
    """Used to apply heading to table of contents."""

    def afterFlowable(self, flowable):
        """Adds Heading1 to table of contents"""
        if flowable.__class__.__name__ == 'Paragraph':
            style = flowable.style.name
            text = flowable.getPlainText()
            # Unique bookmark key from the 'toc' sequence counter.
            key = '%s' % self.seq.nextf('toc')
            if style == 'Heading1':
                self.canv.bookmarkPage(key)
                # Level-1 TOC entry pointing at the bookmarked page.
                self.notify('TOCEntry', [1, text, self.page, key])
|
||||||
|
|
||||||
|
|
||||||
|
def create_toc():
    """Build the table-of-contents flowables.

    Returns a two-element list: a configured TableOfContents followed
    by a PageBreak so document content starts on a fresh page.
    """
    toc = TableOfContents()
    toc.dotsMinLevel = 0  # draw leader dots at every level
    toc.levelStyles = [
        ParagraphStyle(name='Heading1', fontSize=16, leading=16),
        ParagraphStyle(name='Heading2', fontSize=14, leading=14),
    ]
    return [toc, PageBreak()]
|
||||||
|
|
||||||
|
|
||||||
|
def create_pdf(filename, pdf_template_filename):
    """Create *filename*, a PDF whose every page is drawn on top of
    the first page of *pdf_template_filename*.

    The document is a table of contents followed by 200 dummy
    "Hello World" headings.
    """
    # Context manager ensures the output file is closed even if
    # document building raises (the original leaked the handle then).
    with open(filename, "wb") as pdf_report:
        document = MyDocTemplate(pdf_report)
        document.addPageTemplates(
            [MyTemplate(pdf_template_filename, name='background')])

        styles = getSampleStyleSheet()
        elements = [NextPageTemplate('background')]
        elements.extend(create_toc())

        # Dummy content (hello world x 200)
        for i in range(200):
            elements.append(
                Paragraph("Hello World" + str(i), styles['Heading1']))

        # multiBuild: multiple passes so the TOC page numbers settle.
        document.multiBuild(elements)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # usage: platypus_pdf_template.py source.pdf
    template, = sys.argv[1:]
    output = 'platypus_pdf_template.' + os.path.basename(template)
    create_pdf(output, template)
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: subset.py my.pdf firstpage lastpage
|
||||||
|
|
||||||
|
Creates subset_<pagenum>_to_<pagenum>.my.pdf
|
||||||
|
|
||||||
|
|
||||||
|
Uses Form XObjects and reportlab to create output file.
|
||||||
|
|
||||||
|
Demonstrates use of pdfrw with reportlab.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from reportlab.pdfgen.canvas import Canvas
|
||||||
|
|
||||||
|
from pdfrw import PdfReader
|
||||||
|
from pdfrw.buildxobj import pagexobj
|
||||||
|
from pdfrw.toreportlab import makerl
|
||||||
|
|
||||||
|
|
||||||
|
def go(inpfn, firstpage, lastpage):
    """Copy pages firstpage..lastpage (1-based, inclusive) of *inpfn*
    into ``subset.<basename>`` via Form XObjects and reportlab.
    """
    first, last = int(firstpage), int(lastpage)
    outfn = 'subset.' + os.path.basename(inpfn)

    selected = PdfReader(inpfn).pages[first - 1:last]
    canvas = Canvas(outfn)
    for xobj in (pagexobj(page) for page in selected):
        # Match each output page to its source page size.
        canvas.setPageSize((xobj.BBox[2], xobj.BBox[3]))
        canvas.doForm(makerl(canvas, xobj))
        canvas.showPage()
    canvas.save()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # usage: subset.py my.pdf firstpage lastpage
    inpfn, firstpage, lastpage = sys.argv[1:]
    go(inpfn, firstpage, lastpage)
|
|
@ -0,0 +1,5 @@
|
||||||
|
The copy.py demo in this directory parses the graphics stream from the PDF and actually plays it back through reportlab.
|
||||||
|
|
||||||
|
Doesn't yet handle fonts or unicode very well.
|
||||||
|
|
||||||
|
For a more practical demo, look at the Form XObjects approach in the examples/rl1 directory.
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: copy.py my.pdf
|
||||||
|
|
||||||
|
Creates copy.my.pdf
|
||||||
|
|
||||||
|
Uses somewhat-functional parser. For better results
|
||||||
|
for most things, see the Form XObject-based method.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from reportlab.pdfgen.canvas import Canvas
|
||||||
|
|
||||||
|
from decodegraphics import parsepage
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PdfArray
|
||||||
|
|
||||||
|
# Script entry: copy.py my.pdf -> copy.my.pdf
inpfn, = sys.argv[1:]
outfn = 'copy.' + os.path.basename(inpfn)
# decompress=True so the content stream is parseable text.
pages = PdfReader(inpfn, decompress=True).pages
canvas = Canvas(outfn, pageCompression=0)

for page in pages:
    box = [float(x) for x in page.MediaBox]
    # Parser assumes the media box origin is (0, 0).
    assert box[0] == box[1] == 0, "demo won't work on this PDF"
    canvas.setPageSize(box[2:])
    # Replay the page's graphics stream onto the reportlab canvas.
    parsepage(page, canvas)
    canvas.showPage()
canvas.save()
|
|
@ -0,0 +1,457 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
This file is an example parser that will parse a graphics stream
|
||||||
|
into a reportlab canvas.
|
||||||
|
|
||||||
|
Needs work on fonts and unicode, but works on a few PDFs.
|
||||||
|
|
||||||
|
Better to use Form XObjects for most things (see the example in rl1).
|
||||||
|
|
||||||
|
'''
|
||||||
|
from inspect import getargspec
|
||||||
|
|
||||||
|
from pdfrw import PdfTokens
|
||||||
|
from pdfrw.objects import PdfString
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Graphics parsing
|
||||||
|
|
||||||
|
|
||||||
|
def parse_array(self, token='[', params=None):
    """Collect tokens up to the matching ']' into a list and push it
    onto the pending-parameter stack (self.params).
    """
    collected = []
    for tok in self.tokens:
        if tok == ']':
            break
        collected.append(tok)
    self.params.append(collected)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_savestate(self, token='q', params=''):
|
||||||
|
self.canv.saveState()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_restorestate(self, token='Q', params=''):
|
||||||
|
self.canv.restoreState()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_transform(self, token='cm', params='ffffff'):
|
||||||
|
self.canv.transform(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_linewidth(self, token='w', params='f'):
|
||||||
|
self.canv.setLineWidth(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_linecap(self, token='J', params='i'):
|
||||||
|
self.canv.setLineCap(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_linejoin(self, token='j', params='i'):
|
||||||
|
self.canv.setLineJoin(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_miterlimit(self, token='M', params='f'):
|
||||||
|
self.canv.setMiterLimit(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_dash(self, token='d', params='as'): # Array, string
|
||||||
|
self.canv.setDash(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_intent(self, token='ri', params='n'):
|
||||||
|
# TODO: add logging
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def parse_flatness(self, token='i', params='i'):
|
||||||
|
# TODO: add logging
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def parse_gstate(self, token='gs', params='n'):
|
||||||
|
# TODO: add logging
|
||||||
|
# Could parse stuff we care about from here later
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def parse_move(self, token='m', params='ff'):
|
||||||
|
if self.gpath is None:
|
||||||
|
self.gpath = self.canv.beginPath()
|
||||||
|
self.gpath.moveTo(*params)
|
||||||
|
self.current_point = params
|
||||||
|
|
||||||
|
|
||||||
|
def parse_line(self, token='l', params='ff'):
|
||||||
|
self.gpath.lineTo(*params)
|
||||||
|
self.current_point = params
|
||||||
|
|
||||||
|
|
||||||
|
def parse_curve(self, token='c', params='ffffff'):
|
||||||
|
self.gpath.curveTo(*params)
|
||||||
|
self.current_point = params[-2:]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_curve1(self, token='v', params='ffff'):
|
||||||
|
parse_curve(self, token, tuple(self.current_point) + tuple(params))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_curve2(self, token='y', params='ffff'):
|
||||||
|
parse_curve(self, token, tuple(params) + tuple(params[-2:]))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_close(self, token='h', params=''):
|
||||||
|
self.gpath.close()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_rect(self, token='re', params='ffff'):
|
||||||
|
if self.gpath is None:
|
||||||
|
self.gpath = self.canv.beginPath()
|
||||||
|
self.gpath.rect(*params)
|
||||||
|
self.current_point = params[-2:]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stroke(self, token='S', params=''):
|
||||||
|
finish_path(self, 1, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_close_stroke(self, token='s', params=''):
|
||||||
|
self.gpath.close()
|
||||||
|
finish_path(self, 1, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill(self, token='f', params=''):
|
||||||
|
finish_path(self, 0, 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_compat(self, token='F', params=''):
|
||||||
|
finish_path(self, 0, 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_even_odd(self, token='f*', params=''):
|
||||||
|
finish_path(self, 0, 1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_stroke_even_odd(self, token='B*', params=''):
|
||||||
|
finish_path(self, 1, 1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_stroke(self, token='B', params=''):
|
||||||
|
finish_path(self, 1, 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_close_fill_stroke_even_odd(self, token='b*', params=''):
|
||||||
|
self.gpath.close()
|
||||||
|
finish_path(self, 1, 1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_close_fill_stroke(self, token='b', params=''):
|
||||||
|
self.gpath.close()
|
||||||
|
finish_path(self, 1, 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_nop(self, token='n', params=''):
|
||||||
|
finish_path(self, 0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def finish_path(self, stroke, fill, fillmode):
    # Paint and discard the current geometry path, if any.
    # stroke/fill are 0/1 flags passed through to drawPath; fillmode
    # is temporarily swapped into canvas._fillMode for the draw
    # (even-odd vs nonzero winding -- confirm against reportlab docs).
    if self.gpath is not None:
        canv = self.canv
        canv._fillMode, oldmode = fillmode, canv._fillMode
        canv.drawPath(self.gpath, stroke, fill)
        canv._fillMode = oldmode
        self.gpath = None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_clip_path(self, token='W', params=''):
|
||||||
|
# TODO: add logging
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def parse_clip_path_even_odd(self, token='W*', params=''):
|
||||||
|
# TODO: add logging
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stroke_gray(self, token='G', params='f'):
|
||||||
|
self.canv.setStrokeGray(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_gray(self, token='g', params='f'):
|
||||||
|
self.canv.setFillGray(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stroke_rgb(self, token='RG', params='fff'):
|
||||||
|
self.canv.setStrokeColorRGB(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_rgb(self, token='rg', params='fff'):
|
||||||
|
self.canv.setFillColorRGB(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stroke_cmyk(self, token='K', params='ffff'):
|
||||||
|
self.canv.setStrokeColorCMYK(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fill_cmyk(self, token='k', params='ffff'):
|
||||||
|
self.canv.setFillColorCMYK(*params)
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Text parsing
|
||||||
|
|
||||||
|
|
||||||
|
def parse_begin_text(self, token='BT', params=''):
|
||||||
|
assert self.tpath is None
|
||||||
|
self.tpath = self.canv.beginText()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_transform(self, token='Tm', params='ffffff'):
|
||||||
|
path = self.tpath
|
||||||
|
|
||||||
|
# Stoopid optimization to remove nop
|
||||||
|
try:
|
||||||
|
code = path._code
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if code[-1] == '1 0 0 1 0 0 Tm':
|
||||||
|
code.pop()
|
||||||
|
|
||||||
|
path.setTextTransform(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_setfont(self, token='Tf', params='nf'):
|
||||||
|
fontinfo = self.fontdict[params[0]]
|
||||||
|
self.tpath._setFont(fontinfo.name, params[1])
|
||||||
|
self.curfont = fontinfo
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_out(self, token='Tj', params='t'):
|
||||||
|
text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
|
||||||
|
self.tpath.textOut(text)
|
||||||
|
|
||||||
|
def parse_lf_text_out(self, token="'", params='t'):
|
||||||
|
self.tpath.textLine()
|
||||||
|
text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
|
||||||
|
self.tpath.textOut(text)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
|
||||||
|
self.tpath.setWordSpace(params[0])
|
||||||
|
self.tpath.setCharSpace(params[1])
|
||||||
|
self.tpath.textLine()
|
||||||
|
text = params[2].decode(self.curfont.remap, self.curfont.twobyte)
|
||||||
|
self.tpath.textOut(text)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_TJ(self, token='TJ', params='a'):
    # TJ: show text with individual glyph positioning.  The operand is
    # an array mixing strings (text runs) and numbers (kerning
    # adjustments); the numeric adjustments are currently ignored.
    remap = self.curfont.remap
    twobyte = self.curfont.twobyte
    result = []
    for x in params[0]:
        if isinstance(x, PdfString):
            result.append(x.decode(remap, twobyte))
        else:
            # TODO: Adjust spacing between characters here
            int(x)  # validated but otherwise discarded
    text = ''.join(result)
    self.tpath.textOut(text)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_end_text(self, token='ET', params=''):
|
||||||
|
assert self.tpath is not None
|
||||||
|
self.canv.drawText(self.tpath)
|
||||||
|
self.tpath = None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_move_cursor(self, token='Td', params='ff'):
|
||||||
|
self.tpath.moveCursor(params[0], -params[1])
|
||||||
|
|
||||||
|
|
||||||
|
def parse_set_leading(self, token='TL', params='f'):
|
||||||
|
self.tpath.setLeading(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_line(self, token='T*', params=''):
|
||||||
|
self.tpath.textLine()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_set_char_space(self, token='Tc', params='f'):
|
||||||
|
self.tpath.setCharSpace(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_set_word_space(self, token='Tw', params='f'):
|
||||||
|
self.tpath.setWordSpace(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_set_hscale(self, token='Tz', params='f'):
|
||||||
|
self.tpath.setHorizScale(params[0] - 100)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_set_rise(self, token='Ts', params='f'):
|
||||||
|
self.tpath.setRise(*params)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_xobject(self, token='Do', params='n'):
|
||||||
|
# TODO: Need to do this
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class FontInfo(object):
    ''' Pretty basic -- needs a lot of work to work right for all fonts.

    Extracts a reportlab-usable font name and, when the font carries a
    /ToUnicode CMap, a character-code remapping function.
    '''
    lookup = {
        # WRONG -- have to learn about font stuff...
        'BitstreamVeraSans': 'Helvetica',
        }

    def __init__(self, source):
        # source is a PDF font dictionary; BaseFont is a /Name, so
        # strip the leading slash.
        name = source.BaseFont[1:]
        self.name = self.lookup.get(name, name)
        # Defaults: identity-ish remap, single-byte codes.
        self.remap = chr
        self.twobyte = False
        info = source.ToUnicode
        if not info:
            return
        # Parse only the beginbfchar..endbfchar section of the CMap;
        # tokens alternate source code / Unicode value, hex-encoded.
        info = info.stream.split('beginbfchar')[1].split('endbfchar')[0]
        info = list(PdfTokens(info))
        assert not len(info) & 1
        info2 = []
        for x in info:
            # Each token looks like <xx> or <xxxx> (1- or 2-byte hex).
            assert x[0] == '<' and x[-1] == '>' and len(x) in (4, 6), x
            i = int(x[1:-1], 16)
            info2.append(i)
        # Map source code -> unicode char; .get used as the remap fn.
        self.remap = dict((x, chr(y)) for (x, y) in
                          zip(info2[::2], info2[1::2])).get
        self.twobyte = len(info[0]) > 4
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Control structures
|
||||||
|
|
||||||
|
|
||||||
|
def findparsefuncs():
    """Scan this module for parse_* functions and build the dispatch
    table mapping each content-stream operator token to a
    (handler, operand-converter-tuple) pair.

    Each parse_* function declares its operator token and operand
    signature through its two keyword defaults; the signature string
    is turned into a tuple of validator/conversion callables.
    """
    # inspect.getargspec was deprecated since 3.0 and removed in
    # Python 3.11; getfullargspec is the compatible replacement.
    from inspect import getfullargspec

    def checkname(n):
        # PDF name objects always begin with '/'.
        assert n.startswith('/')
        return n

    def checkarray(a):
        assert isinstance(a, list), a
        return a

    def checktext(t):
        assert isinstance(t, PdfString)
        return t

    # One converter per signature character:
    # f=float, i=int, n=name, a=array, s=string, t=text
    fixparam = dict(f=float, i=int, n=checkname, a=checkarray,
                    s=str, t=checktext)
    fixcache = {}

    def fixlist(params):
        # Memoize the converter tuple for each signature string.
        try:
            result = fixcache[params]
        except KeyError:
            result = tuple(fixparam[x] for x in params)
            fixcache[params] = result
        return result

    dispatch = {}
    expected_args = 'self token params'.split()
    for key, func in globals().items():
        if key.startswith('parse_'):
            spec = getfullargspec(func)
            # Every handler must have exactly the canonical signature.
            assert (spec.args == expected_args and spec.varargs is None and
                    spec.varkw is None and len(spec.defaults) == 2), (
                key, spec.args, spec.varargs, spec.varkw, spec.defaults)
            token, params = spec.defaults
            if params is not None:
                params = fixlist(params)
            value = func, params
            # Two handlers must not claim the same operator token.
            assert dispatch.setdefault(token, value) is value, repr(token)
    return dispatch
|
||||||
|
|
||||||
|
|
||||||
|
class _ParseClass(object):
    """Content-stream interpreter: operand tokens accumulate in
    self.params until an operator token appears, which is dispatched
    to the matching parse_* handler with converted operands.
    """
    # Operator token -> (handler, tuple of operand converters or None).
    dispatch = findparsefuncs()

    @classmethod
    def parsepage(cls, page, canvas=None):
        """Interpret *page*'s (uncompressed) content stream, replaying
        it onto *canvas* (a reportlab canvas; None works for handlers
        that never touch it, e.g. the debug parser).
        """
        self = cls()
        contents = page.Contents
        if contents.Filter is not None:
            raise SystemExit('Cannot parse graphics -- page encoded with %s'
                             % contents.Filter)
        dispatch = cls.dispatch.get
        self.tokens = tokens = iter(PdfTokens(contents.stream))
        self.params = params = []
        self.canv = canvas
        self.gpath = None   # current geometry path, if any
        self.tpath = None   # current text object, if any
        self.fontdict = dict((x, FontInfo(y)) for
                             (x, y) in page.Resources.Font.items())

        for token in self.tokens:
            info = dispatch(token)
            if info is None:
                # Not an operator: it is an operand; stash it.
                params.append(token)
                continue
            func, paraminfo = info
            if paraminfo is None:
                # Handler collects its own operands (e.g. '[').
                func(self, token, ())
                continue
            delta = len(params) - len(paraminfo)
            if delta:
                if delta < 0:
                    print ('Operator %s expected %s parameters, got %s' %
                           (token, len(paraminfo), params))
                    params[:] = []
                    continue
                else:
                    # Extra leading operands: report and discard.
                    print ("Unparsed parameters/commands: %s" % params[:delta])
                    del params[:delta]
            # Materialize as a list: on Python 3 zip() is a one-shot
            # iterator, and the except branch below must iterate the
            # pairs a second time (the original exhausted it).
            paraminfo = list(zip(paraminfo, params))
            try:
                params[:] = [x(y) for (x, y) in paraminfo]
            except Exception:
                # Re-run converters one at a time to find the bad one.
                for i, (x, y) in enumerate(paraminfo):
                    try:
                        x(y)
                    except Exception:
                        raise  # For now
                continue
            func(self, token, params)
            params[:] = []
|
||||||
|
|
||||||
|
|
||||||
|
def debugparser(undisturbed=set('parse_array'.split())):
    """Return a parsepage function that logs every operator dispatch
    instead of executing it, except for handlers named in
    *undisturbed* (parse_array must really run, since it consumes
    tokens).
    """
    def debugdispatch():
        def getvalue(oldval):
            # oldval is a (handler, converters) pair from the table.
            name = oldval[0].__name__

            def myfunc(self, token, params):
                print ('%s called %s(%s)' % (token, name,
                                             ', '.join(str(x) for x in params)))
            if name in undisturbed:
                # Keep the real handler for token-consuming operators.
                myfunc = oldval[0]
            return myfunc, oldval[1]
        return dict((x, getvalue(y))
                    for (x, y) in _ParseClass.dispatch.items())

    class _DebugParse(_ParseClass):
        # Same table shape, but with logging stand-ins.
        dispatch = debugdispatch()

    return _DebugParse.parsepage
|
||||||
|
|
||||||
|
parsepage = _ParseClass.parsepage
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Debug driver: log each operator dispatch for every page of the
    # given PDF (no canvas -- nothing is drawn).
    import sys
    from pdfrw import PdfReader
    parse = debugparser()
    fname, = sys.argv[1:]
    pdf = PdfReader(fname, decompress=True)
    for i, page in enumerate(pdf.pages):
        print ('\nPage %s ------------------------------------' % i)
        parse(page)
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: rotate.py my.pdf rotation [page[range] ...]
|
||||||
|
eg. rotate.py my.pdf 270 1-3 5 7-9
|
||||||
|
|
||||||
|
Rotation must be multiple of 90 degrees, clockwise.
|
||||||
|
|
||||||
|
Creates rotate.my.pdf with selected pages rotated. Rotates all by default.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
# Script entry: rotate.py my.pdf rotation [page[range] ...]
inpfn = sys.argv[1]
rotate = sys.argv[2]
ranges = sys.argv[3:]

rotate = int(rotate)
# PDF /Rotate only supports quarter turns.
assert rotate % 90 == 0

# Each range argument is "first" or "first-last" (1-based, inclusive).
ranges = [[int(y) for y in x.split('-')] for x in ranges]
outfn = 'rotate.%s' % os.path.basename(inpfn)
trailer = PdfReader(inpfn)
pages = trailer.pages

# No ranges given: rotate every page.
if not ranges:
    ranges = [[1, len(pages)]]

for onerange in ranges:
    # A bare page number becomes a one-page range.
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0]-1, onerange[1]):
        # Add to any inherited rotation, normalized to 0..359.
        pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or
                                     0) + rotate) % 360

outdata = PdfWriter(outfn)
outdata.trailer = trailer
outdata.write()
|
|
@ -0,0 +1,29 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: subset.py my.pdf page[range] [page[range]] ...
|
||||||
|
eg. subset.py my.pdf 1-3 5 7-9
|
||||||
|
|
||||||
|
Creates subset.my.pdf
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
# Script entry: subset.py my.pdf page[range] [page[range]] ...
inpfn = sys.argv[1]
ranges = sys.argv[2:]
assert ranges, "Expected at least one range"

# Each range argument is "first" or "first-last" (1-based, inclusive).
ranges = ([int(y) for y in x.split('-')] for x in ranges)
outfn = 'subset.%s' % os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
outdata = PdfWriter(outfn)

for onerange in ranges:
    # A bare page number becomes a one-page range.
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0], onerange[1]+1):
        outdata.addpage(pages[pagenum-1])
outdata.write()
|
|
@ -0,0 +1,61 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: subset_booklets.py my.pdf
|
||||||
|
|
||||||
|
Creates subset_booklets.my.pdf
|
||||||
|
|
||||||
|
Pages organized in a form suitable for booklet printing, e.g.
|
||||||
|
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
|
||||||
|
Instead of a large booklet, the pdf is divided into several mini
|
||||||
|
booklets. The reason is: professional printing works this way:
|
||||||
|
- Print all of several mini booklets(subsets of booklet);
|
||||||
|
- Saw each mini booklet individually;
|
||||||
|
- glue them all together;
|
||||||
|
- Insert the cover.
|
||||||
|
|
||||||
|
Take a look at http://www.wikihow.com/Bind-a-Book
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
BOOKLET_SIZE = 20
|
||||||
|
START = time.time()
|
||||||
|
|
||||||
|
def fixpage(*pages):
    """Merge the given source pages side by side and render the result.

    None entries (blank filler pages) are skipped; the last page is
    shifted right by the width of the first so the pair sits two-up.
    """
    real_pages = [page for page in pages if page is not None]
    merged = PageMerge() + real_pages
    merged[-1].x += merged[0].w
    return merged.render()
|
||||||
|
|
||||||
|
# Script entry: subset_booklets.py my.pdf -> booklet.my.pdf
INPFN, = sys.argv[1:]
OUTFN = 'booklet.' + os.path.basename(INPFN)
ALL_IPAGES = PdfReader(INPFN).pages
# Python-2 print statements replaced with the function form so this
# example runs on Python 3 like the others.
print('The pdf file ' + str(INPFN) + ' has ' +
      str(len(ALL_IPAGES)) + ' pages.')

# Make sure we have an even number
if len(ALL_IPAGES) & 1:
    ALL_IPAGES.append(None)
    print('Inserting one more blank page to make pages number even.')
NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)

print('Making ' + str(NUM_OF_ITER) + ' subbooklets of ' +
      str(BOOKLET_SIZE) + ' pages each.')
opages = []
for iteration in range(0, NUM_OF_ITER):
    ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
    # Pair pages outside-in: (last, first), (second, second-to-last), ...
    while len(ipages) > 2:
        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
        opages.append(fixpage(ipages.pop(0), ipages.pop()))

# Making one more subbooklet with the left pages
ipages = ALL_IPAGES[len(ALL_IPAGES)-ITERS_LEFT:len(ALL_IPAGES)]
while len(ipages) > 2:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))
    opages.append(fixpage(ipages.pop(0), ipages.pop()))
if len(ipages) >= 1:
    opages.append(fixpage(ipages.pop(), ipages.pop(0)))

PdfWriter(OUTFN).addpages(opages).write()
print('It took ' + str(round(time.time()-START, 2)) +
      ' seconds to make the pdf subbooklets changes.')
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage: unspread.py my.pdf
|
||||||
|
|
||||||
|
Creates unspread.my.pdf
|
||||||
|
|
||||||
|
Chops each page in half, e.g. if a source were
|
||||||
|
created in booklet form, you could extract individual
|
||||||
|
pages.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
|
||||||
|
def splitpage(src):
    """Split a page into two (left and right).

    Yields one rendered page per half of *src*.
    """
    # viewrect coordinates are fractions of the source page.
    for half_start in (0, 0.5):
        view = (half_start, 0, 0.5, 1)
        yield PageMerge().add(src, viewrect=view).render()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry: unspread.py my.pdf -> unspread.my.pdf
inpfn, = sys.argv[1:]
outfn = 'unspread.' + os.path.basename(inpfn)
writer = PdfWriter(outfn)
for page in PdfReader(inpfn).pages:
    # Two output pages (left half, right half) per input page.
    writer.addpages(splitpage(page))
writer.write()
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
'''
|
||||||
|
Simple example of watermarking using form xobjects (pdfrw).
|
||||||
|
|
||||||
|
usage: watermark.py [-u] my.pdf single_page.pdf
|
||||||
|
|
||||||
|
Creates watermark.my.pdf, with every page overlaid with
|
||||||
|
first page from single_page.pdf. If -u is selected, watermark
|
||||||
|
will be placed underneath page (painted first).
|
||||||
|
|
||||||
|
NOTE 1: This program assumes that all pages (including the watermark
|
||||||
|
page) are the same size. For other possibilities, see
|
||||||
|
the fancy_watermark.py example.
|
||||||
|
|
||||||
|
NOTE 2: At one point, this example was extremely complicated, with
|
||||||
|
multiple options. That only led to errors in implementation,
|
||||||
|
so it has been re-simplified in order to show basic principles
|
||||||
|
of the library operation and to match the other examples better.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||||
|
|
||||||
|
# Script entry: watermark.py [-u] my.pdf single_page.pdf
argv = sys.argv[1:]
# Optional -u flag: paint the watermark underneath the page content.
underneath = '-u' in argv
if underneath:
    del argv[argv.index('-u')]
inpfn, wmarkfn = argv
outfn = 'watermark.' + os.path.basename(inpfn)
# First page of the watermark file, wrapped for reuse on every page.
wmark = PageMerge().add(PdfReader(wmarkfn).pages[0])[0]
trailer = PdfReader(inpfn)
for page in trailer.pages:
    # prepend=True draws the watermark first, i.e. underneath.
    PageMerge(page).add(wmark, prepend=underneath).render()
PdfWriter(outfn, trailer=trailer).write()
|
|
@ -0,0 +1,23 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
from .pdfwriter import PdfWriter
|
||||||
|
from .pdfreader import PdfReader
|
||||||
|
from .objects import (PdfObject, PdfName, PdfArray,
|
||||||
|
PdfDict, IndirectPdfDict, PdfString)
|
||||||
|
from .tokens import PdfTokens
|
||||||
|
from .errors import PdfParseError
|
||||||
|
from .pagemerge import PageMerge
|
||||||
|
|
||||||
|
__version__ = '0.4'

# Aliases matching pyPdf's class names, for a small measure of
# drop-in compatibility with code written against that library.
PdfFileReader = PdfReader
PdfFileWriter = PdfWriter

# Public API of the package.
__all__ = """PdfWriter PdfReader PdfObject PdfName PdfArray
             PdfTokens PdfParseError PdfDict IndirectPdfDict
             PdfString PageMerge""".split()
|
||||||
|
|
|
@ -0,0 +1,363 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
This module contains code to build PDF "Form XObjects".
|
||||||
|
|
||||||
|
A Form XObject allows a fragment from one PDF file to be cleanly
|
||||||
|
included in another PDF file.
|
||||||
|
|
||||||
|
Reference for syntax: "Parameters for opening PDF files" from SDK 8.1
|
||||||
|
|
||||||
|
http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf
|
||||||
|
|
||||||
|
supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>'
|
||||||
|
|
||||||
|
Also supported by this, but not by Adobe:
|
||||||
|
'rotate=xxx' where xxx in [0, 90, 180, 270]
|
||||||
|
|
||||||
|
Units are in points
|
||||||
|
|
||||||
|
|
||||||
|
Reference for content: Adobe PDF reference, sixth edition, version 1.7
|
||||||
|
|
||||||
|
http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
|
||||||
|
|
||||||
|
Form xobjects discussed chapter 4.9, page 355
|
||||||
|
'''
|
||||||
|
|
||||||
|
from .objects import PdfDict, PdfArray, PdfName
|
||||||
|
from .pdfreader import PdfReader
|
||||||
|
from .errors import log, PdfNotImplementedError
|
||||||
|
from .py23_diffs import iteritems
|
||||||
|
from .uncompress import uncompress
|
||||||
|
from .compress import compress
|
||||||
|
|
||||||
|
|
||||||
|
class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        Note 1:
            Viewrects follow the adobe definition.  (See reference
            above). They are arrays of 4 numbers:

            - Distance from left of document in points
            - Distance from top (NOT bottom) of document in points
            - Width of rectangle in points
            - Height of rectangle in points

        Note 2:
            For simplicity, Viewrects can also be specified
            in fractions of the document.  If every number in
            the viewrect is between 0 and 1 inclusive, then
            viewrect elements 0 and 2 are multiplied by the
            mediabox width before use, and viewrect elements
            1 and 3 are multiplied by the mediabox height before
            use.

        Note 3:
            By default, an XObject based on the view will be
            cacheable.  It should not be cacheable if the XObject
            will be subsequently modified.
    '''
    # Class-level defaults; parsing sets instance attributes.
    doc = None        # an already-parsed document, if supplied via kw
    docname = None    # source filename parsed from the uri
    page = None       # 1-based page number from 'page=N'
    viewrect = None   # [x, y, w, h] from 'viewrect=...' (see Notes 1/2)
    rotate = None     # extra rotation in degrees from 'rotate=N'
    cacheable = True  # see Note 3

    def __init__(self, pageinfo='', **kw):
        # Split "name#opt1&opt2" (or "name#opt1#opt2") into pieces;
        # '&' and '#' are interchangeable option separators.
        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
        # If the first piece already looks like an option, there is no
        # filename component; otherwise peel the filename off the front.
        for key in 'page viewrect'.split():
            if pageinfo[0].startswith(key + '='):
                break
        else:
            self.docname = pageinfo.pop(0)
        for item in pageinfo:
            key, value = item.split('=')
            key = key.strip()
            # Option values may be comma- or space-separated numbers.
            value = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                log.error('Unknown option: %s', key)
        # Keyword arguments may directly set any attribute that
        # already exists on the class (typo protection via assert).
        for key, value in iteritems(kw):
            assert hasattr(self, key), key
            setattr(self, key, value)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rotation(rotate):
    ''' Translate a /Rotate value into a clockwise quarter-turn count:
        0 = unrotated, 1 = 90 degrees, 2 = 180 degrees, 3 = 270 degrees.
        Anything unparseable, or not a multiple of 90, maps to 0.
    '''
    try:
        degrees = int(rotate)
    except (ValueError, TypeError):
        return 0
    return degrees // 90 if degrees % 90 == 0 else 0
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_point(point, rotation):
    ''' Rotate an (x, y) coordinate clockwise by a rotation code
        specifying a multiple of 90 degrees (quarter-turn count).
    '''
    x, y = point
    if rotation & 1:
        # Odd quarter turns swap the axes.
        x, y = y, -x
    if rotation & 2:
        # A half turn negates both coordinates.
        x, y = -x, -y
    return x, y
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_rect(rect, rotation):
    ''' Rotate both corner points of a rectangle by a quarter-turn
        count, then normalize the result so it reads as
        (lower-left x, lower-left y, upper-right x, upper-right y).
    '''
    corners = []
    for x, y in (rect[:2], rect[2:]):
        # Inline clockwise quarter-turn rotation of a single point.
        if rotation & 1:
            x, y = y, -x
        if rotation & 2:
            x, y = -x, -y
        corners.append((x, y))
    (x0, y0), (x1, y1) = corners
    return (min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1))
|
||||||
|
|
||||||
|
|
||||||
|
def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        inheritable -- object exposing MediaBox and CropBox
        pageinfo -- a ViewInfo; only its viewrect is read here
        rotation -- clockwise quarter-turn count (see get_rotation)
    '''
    mbox = tuple([float(x) for x in inheritable.MediaBox])
    # Fall back to the media box when no crop box is present.
    cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
    vrect = pageinfo.viewrect
    if vrect is not None:
        # Rotate the media box to match what the user sees,
        # figure out the clipping box, then rotate back
        mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
        x, y, w, h = vrect

        # Support operations in fractions of a page: if every number
        # in the viewrect is within [0, 1], scale x/w by the page
        # width and y/h by the page height before use.
        if 0 <= min(vrect) < max(vrect) <= 1:
            mw = mright - mleft
            mh = mtop - mbot
            x *= mw
            w *= mw
            y *= mh
            h *= mh

        # viewrect's y is measured from the *top* of the page.
        cleft = mleft + x
        ctop = mtop - y
        cright = cleft + w
        cbot = ctop - h
        # Clamp the requested view to the page boundaries.
        cbox = (max(mleft, cleft), max(mbot, cbot),
                min(mright, cright), min(mtop, ctop))
        cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox
|
||||||
|
|
||||||
|
|
||||||
|
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        Also, the spec says nothing about nested arrays,
        so we assume those don't exist until we see one
        in the wild.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # If we don't allow compressed objects, OR if we have multiple compressed
    # objects, we try to decompress them, and fail if we cannot do that.
    if not allow_compressed or len(array) > 1:
        # More keys than just /Length implies filter entries,
        # i.e. at least one compressed stream.
        keys = set(x[0] for cdict in array for x in iteritems(cdict))
        was_compressed = len(keys) > 1
        if was_compressed:
            # Make copies of the objects before we uncompress them.
            array = [PdfDict(x) for x in array]
            if not uncompress(array):
                raise PdfNotImplementedError(
                    'Xobjects with these compression parameters not supported: %s' %
                    keys)

    # Cache the (possibly merged) copy on the source object so later
    # calls can return it immediately.
    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    if len(array) > 1:
        # Merge multiple content streams, newline-joined so tokens at
        # the stream boundaries stay separated.
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream
        if was_compressed and allow_compressed:
            # Recompress the merged stream, since the caller allows it.
            compress(xobj_copy)

    return xobj_copy
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h

        contents -- cache dict built by _build_cache
        mbox, bbox -- media box and bounding (clip) box tuples
        rotation -- clockwise quarter-turn count
        cacheable -- when False, always build a fresh object
                     (callers that intend to modify the result)
    '''
    cachedict = contents.xobj_cachedict
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey) if cacheable else None
    if result is None:
        # If we are not getting a full page, or if we are going to
        # modify the results, first retrieve an underlying Form XObject
        # that represents the entire page, so that we are not copying
        # the full page data into the new file multiple times
        func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable]
        result = PdfDict(
            func(contents, resources, mbox),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # Build the transformation matrix from the rotated unit
            # vectors; the translation part is (0, 0).
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        # Publish placement info privately (not written to the PDF).
        private = result.private
        private.x = rect[0]
        private.y = rect[1]
        private.w = rect[2] - rect[0]
        private.h = rect[3] - rect[1]
        if cacheable:
            cachedict[cachekey] = result
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_fullpage(contents, resources, mbox):
    ''' fullpage is easy. Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.  (mbox is unused here; the parameter exists for
        signature parity with _get_subpage.)
    '''
    return PdfDict(contents, Resources=resources)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subpage(contents, resources, mbox):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.
    '''
    # The subpage's stream simply paints the cached full-page
    # XObject; the clipping comes from the BBox _cache_xobj adds.
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(
            XObject=PdfDict(
                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
            )
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def pagexobj(page, viewinfo=None, allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)

        page -- a page dict from a PdfReader
        viewinfo -- a ViewInfo describing the desired view; None
                    (the default) means the whole page, cacheable.
        allow_compressed -- when False, compressed page streams are
                    decompressed before being wrapped.
    '''
    # Previously the default was the shared instance `viewinfo=ViewInfo()`
    # (the classic mutable-default-argument trap).  A None sentinel
    # gives each call its own fresh ViewInfo with identical behavior.
    if viewinfo is None:
        viewinfo = ViewInfo()
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    # Extra user-requested rotation stacks on the page's own /Rotate.
    rotation += get_rotation(viewinfo.rotate)
    contents = _build_cache(page.Contents, allow_compressed)
    return _cache_xobj(contents, resources, mbox, bbox, rotation,
                       viewinfo.cacheable)
|
||||||
|
|
||||||
|
|
||||||
|
def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' docinfo reads a page out of a document and uses
        pagexobj to create the Form XObject based on
        the page.

        This is a convenience function for things like
        rst2pdf that want to be able to pass in textual
        filename/location descriptors and don't want to
        know about using PdfReader.

        Can work standalone, or in conjunction with
        the CacheXObj class (below).

        pageinfo -- a ViewInfo, or a uri string to build one from
        doc -- an optional already-parsed PdfReader
    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    # If we're explicitly passed a document,
    # make sure we don't have one implicitly as well.
    # If no implicit or explicit doc, then read one in
    # from the filename.
    if doc is not None:
        assert pageinfo.doc is None
        pageinfo.doc = doc
    elif pageinfo.doc is not None:
        doc = pageinfo.doc
    else:
        doc = pageinfo.doc = PdfReader(pageinfo.docname,
                                       decompress=not allow_compressed)
    assert isinstance(doc, PdfReader)

    # Page numbers in the uri syntax are 1-based; default is page 1.
    sourcepage = doc.pages[(pageinfo.page or 1) - 1]
    return pagexobj(sourcepage, pageinfo, allow_compressed)
|
||||||
|
|
||||||
|
|
||||||
|
class CacheXObj(object):
    ''' Keeps source PDFs from being reparsed over and over, and
        keeps the output from growing larger than it ought to by
        avoiding unnecessary duplicate object copies.

        This is a convenience class for things like rst2pdf that
        want to pass in textual filename/location descriptors and
        don't want to know about using PdfReader.
    '''

    def __init__(self, decompress=False):
        ''' Set decompress to True if the produced Form XObjects
            must have decompressed streams.  Whatever cannot be
            decompressed is logged loudly.
        '''
        self.cached_pdfs = {}
        self.decompress = decompress

    def load(self, sourcename):
        ''' Build a Form XObject from a uri such as
            "file.pdf#page=3", parsing each source file only once.
        '''
        view = ViewInfo(sourcename)
        cache = self.cached_pdfs
        try:
            doc = cache[view.docname]
        except KeyError:
            doc = cache[view.docname] = PdfReader(
                view.docname, decompress=self.decompress)
        return docxobj(view, doc, allow_compressed=not self.decompress)
|
|
@ -0,0 +1,27 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Currently, this sad little file only knows how to compress
|
||||||
|
using the flate (zlib) algorithm. Maybe more later, but it's
|
||||||
|
not a priority for me...
|
||||||
|
'''
|
||||||
|
|
||||||
|
from .objects import PdfName
|
||||||
|
from .uncompress import streamobjects
|
||||||
|
from .py23_diffs import zlib, convert_load, convert_store
|
||||||
|
|
||||||
|
|
||||||
|
def compress(mylist):
    ''' Flate-compress the stream of every stream dictionary in
        mylist that does not already have a /Filter applied.
    '''
    flate = PdfName.FlateDecode
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is not None:
            # Some filter is already applied; leave the object alone.
            continue
        oldstr = obj.stream
        newstr = convert_load(zlib.compress(convert_store(oldstr)))
        # Accept the compressed stream unless it is 30+ bytes larger
        # than the original.  NOTE(review): the slack presumably
        # accounts for dictionary overhead saved elsewhere, meaning a
        # slightly *larger* stream is deliberately allowed through --
        # confirm the 30-byte margin is intentional.
        if len(newstr) < len(oldstr) + 30:
            obj.stream = newstr
            obj.Filter = flate
            # Flate here is applied with no parameters.
            obj.DecodeParms = None
|
|
@ -0,0 +1,150 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2017 Jon Lund Steffensen
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
from __future__ import division
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import struct
|
||||||
|
|
||||||
|
try:
|
||||||
|
from Crypto.Cipher import ARC4, AES
|
||||||
|
HAS_CRYPTO = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_CRYPTO = False
|
||||||
|
|
||||||
|
from .objects import PdfDict, PdfName
|
||||||
|
|
||||||
|
_PASSWORD_PAD = (
|
||||||
|
'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
||||||
|
'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
||||||
|
|
||||||
|
|
||||||
|
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    """Yield every member of mylist that is a stream dictionary
    (a PdfDict carrying a non-None stream)."""
    for candidate in mylist:
        has_stream = isinstance(candidate, PdfDict) and \
            candidate.stream is not None
        if has_stream:
            yield candidate
|
||||||
|
|
||||||
|
|
||||||
|
def create_key(password, doc):
    """Create an encryption key (Algorithm 2 in PDF spec).

    password -- the (already padded/encoded) password string.
    doc -- the document; reads /Encrypt (Length, O, P, R) and the
           first element of the document /ID.

    NOTE(review): the byte-wise string handling here looks
    Python 2 oriented -- confirm behavior under Python 3.
    """
    # /Encrypt.Length is in bits (default 40); the key is in bytes.
    key_size = int(doc.Encrypt.Length or 40) // 8
    # Pad (or truncate) the password to exactly 32 bytes per the spec.
    padded_pass = (password + _PASSWORD_PAD)[:32]
    hasher = hashlib.md5()
    hasher.update(padded_pass)
    hasher.update(doc.Encrypt.O.to_bytes())
    # /P is hashed as a signed 32-bit little-endian integer.
    hasher.update(struct.pack('<i', int(doc.Encrypt.P)))
    hasher.update(doc.ID[0].to_bytes())
    temp_hash = hasher.digest()

    # Security handler revision 3+ re-hashes the truncated key 50 times.
    if int(doc.Encrypt.R or 0) >= 3:
        for _ in range(50):
            temp_hash = hashlib.md5(temp_hash[:key_size]).digest()

    return temp_hash[:key_size]
|
||||||
|
|
||||||
|
|
||||||
|
def create_user_hash(key, doc):
    """Create the user password hash (Algorithm 4/5).

    key -- the document encryption key from create_key().
    doc -- the document; reads /Encrypt.R and /ID.
    """
    revision = int(doc.Encrypt.R or 0)
    if revision < 3:
        # Algorithm 4: RC4-encrypt the padding constant directly.
        cipher = ARC4.new(key)
        return cipher.encrypt(_PASSWORD_PAD)
    else:
        # Algorithm 5: MD5 of the pad plus the first /ID element...
        hasher = hashlib.md5()
        hasher.update(_PASSWORD_PAD)
        hasher.update(doc.ID[0].to_bytes())
        temp_hash = hasher.digest()

        # ...then 20 RC4 passes, XOR-ing the pass number into each
        # key byte.  NOTE(review): chr()/ord() here implies Python 2
        # str keys -- confirm for Python 3 bytes.
        for i in range(20):
            temp_key = ''.join(chr(i ^ ord(x)) for x in key)
            cipher = ARC4.new(temp_key)
            temp_hash = cipher.encrypt(temp_hash)

        return temp_hash
|
||||||
|
|
||||||
|
|
||||||
|
def check_user_password(key, doc):
    """Check that the user password is correct (Algorithm 6)."""
    expected = create_user_hash(key, doc)
    stored = doc.Encrypt.U.to_bytes()
    if int(doc.Encrypt.R or 0) >= 3:
        # Revision 3+ only defines the first 16 bytes of /U.
        stored = stored[:16]
    return stored == expected
|
||||||
|
|
||||||
|
|
||||||
|
class AESCryptFilter(object):
    """Crypt filter corresponding to /AESV2 (AES in CBC mode)."""

    def __init__(self, key):
        # Document-level encryption key from create_key().
        self._key = key

    def decrypt_data(self, num, gen, data):
        """Decrypt data (string/stream) using key (Algorithm 1).

        num, gen -- object and generation numbers of the owning
        object; mixed into a per-object key.
        """
        # Per-object key: low 3 bytes of num, low 2 bytes of gen,
        # plus the AES-specific 'sAlT' constant, all MD5-hashed.
        key_extension = struct.pack('<i', num)[:3]
        key_extension += struct.pack('<i', gen)[:2]
        key_extension += 'sAlT'
        temp_key = self._key + key_extension
        temp_key = hashlib.md5(temp_key).digest()

        # The first AES block of the data is the CBC initialization vector.
        iv = data[:AES.block_size]
        cipher = AES.new(temp_key, AES.MODE_CBC, iv)
        decrypted = cipher.decrypt(data[AES.block_size:])

        # Remove padding: the last byte gives the pad length (1..16).
        # NOTE(review): ord() on an index implies Python 2 str data.
        pad_size = ord(decrypted[-1])
        assert 1 <= pad_size <= 16
        return decrypted[:-pad_size]
|
||||||
|
|
||||||
|
|
||||||
|
class RC4CryptFilter(object):
    """Crypt filter corresponding to /V2 (RC4)."""

    def __init__(self, key):
        # Document-level encryption key from create_key().
        self._key = key

    def decrypt_data(self, num, gen, data):
        """Decrypt data (string/stream) using key (Algorithm 1)."""
        # Extended per-object key length is capped at 16 bytes.
        new_key_size = min(len(self._key) + 5, 16)
        # Mix in the low 3 bytes of num and low 2 bytes of gen,
        # then MD5 and truncate to the extended key size.
        key_extension = struct.pack('<i', num)[:3]
        key_extension += struct.pack('<i', gen)[:2]
        temp_key = self._key + key_extension
        temp_key = hashlib.md5(temp_key).digest()[:new_key_size]

        cipher = ARC4.new(temp_key)
        return cipher.decrypt(data)
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityCryptFilter(object):
    """Identity crypt filter: passes data through unmodified."""

    def decrypt_data(self, num, gen, data):
        """Return data unchanged; num and gen are ignored."""
        return data
|
||||||
|
|
||||||
|
|
||||||
|
def decrypt_objects(objects, default_filter, filters):
    """Decrypt list of stream objects.

    The parameter default_filter specifies the default filter to use. The
    filters parameter is a dictionary of alternate filters to use when the
    object specifies an alternate filter locally.
    """
    for obj in streamobjects(objects):
        # Skip anything already processed (flagged via private attr).
        if getattr(obj, 'decrypted', False):
            continue

        filter = default_filter

        # Check whether a locally defined crypt filter should override the
        # default filter.
        ftype = obj.Filter
        if ftype is not None:
            if not isinstance(ftype, list):
                ftype = [ftype]
            if len(ftype) >= 1 and ftype[0] == PdfName.Crypt:
                # Consume the leading /Crypt entry; its parameters
                # name the crypt filter to use for this object.
                ftype = ftype[1:]
                parms = obj.DecodeParms or obj.DP
                filter = filters[parms.Name]

        num, gen = obj.indirect
        obj.stream = filter.decrypt_data(num, gen, obj.stream)
        obj.private.decrypted = True
        # Restore any remaining (non-crypt) filter chain.
        obj.Filter = ftype or None
|
|
@ -0,0 +1,41 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
PDF Exceptions and error handling
|
||||||
|
'''
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
# Shared logging setup for the package: a single 'pdfrw' logger
# writing "[LEVEL] file:line message" records to stderr, at
# WARNING level and above by default.
fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d %(message)s')

handler = logging.StreamHandler()
handler.setFormatter(fmt)

log = logging.getLogger('pdfrw')
log.setLevel(logging.WARNING)
log.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
|
class PdfError(Exception):
    "Abstract base class of exceptions thrown by this module"

    def __init__(self, msg):
        # Also pass msg to Exception so that args, repr(), and
        # pickling behave normally (previously args was empty),
        # while keeping the historical .msg attribute for callers.
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg
|
||||||
|
|
||||||
|
|
||||||
|
# Concrete exception types; all share PdfError's msg handling.
class PdfParseError(PdfError):
    "Error thrown by parser/tokenizer"


class PdfOutputError(PdfError):
    "Error thrown by PDF writer"


class PdfNotImplementedError(PdfError):
    "Error thrown on missing features"
|
|
@ -0,0 +1,137 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
''' This module contains a function to find all the XObjects
|
||||||
|
in a document, and another function that will wrap them
|
||||||
|
in page objects.
|
||||||
|
'''
|
||||||
|
|
||||||
|
from .objects import PdfDict, PdfArray, PdfName
|
||||||
|
|
||||||
|
|
||||||
|
def find_objects(source, valid_types=(PdfName.XObject, None),
                 valid_subtypes=(PdfName.Form, PdfName.Image),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    '''
    Find all the objects of a particular kind in a document
    or array. Defaults to looking for Form and Image XObjects.

    This could be done recursively, but some PDFs
    are quite deeply nested, so we do it without
    recursion.

    Note that we don't know exactly where things appear on pages,
    but we aim for a sort order that is (a) mostly in document order,
    and (b) reproducible. For arrays, objects are processed in
    array order, and for dicts, they are processed in key order.
    '''
    container = (PdfDict, PdfArray)

    # Allow passing a list of pages, or a dict
    if isinstance(source, PdfDict):
        source = [source]
    else:
        source = list(source)

    # Iterative depth-first walk with an explicit stack; 'visited'
    # guards against reference cycles (e.g. via /Parent links).
    visited = set()
    source.reverse()
    while source:
        obj = source.pop()
        if not isinstance(obj, container):
            continue
        myid = id(obj)
        if myid in visited:
            continue
        visited.add(myid)
        if isinstance(obj, PdfDict):
            if obj.Type in valid_types and obj.Subtype in valid_subtypes:
                yield obj
            # Queue children in key order for reproducibility,
            # skipping keys (like /Parent) that point back up.
            obj = [y for (x, y) in sorted(obj.iteritems())
                   if x not in no_follow]
        else:
            # TODO: This forces resolution of any indirect objects in
            # the array. It may not be necessary. Don't know if
            # reversed() does any voodoo underneath the hood.
            # It's cheap enough for now, but might be removeable.
            obj and obj[0]
        source.extend(reversed(obj))
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_object(obj, width, margin):
    ''' Wrap an xobj in its own page object.

        obj -- a Form or Image XObject
        width -- page width in points (used only for images)
        margin -- 4-sequence of margins in points; indices 0-3 are
                  used as (left, bottom, right, top) offsets below
    '''
    fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q'
    contents = PdfDict(indirect=True)
    subtype = obj.Subtype
    if subtype == PdfName.Form:
        # A Form XObject is already page-like: reuse its stream,
        # resources, and bounding box directly.
        contents._stream = obj.stream
        contents.Length = obj.Length
        contents.Filter = obj.Filter
        contents.DecodeParms = obj.DecodeParms
        resources = obj.Resources
        mbox = obj.BBox
    elif subtype == PdfName.Image:  # Image
        # Scale the image to the content width (page width minus the
        # horizontal margins), preserving its aspect ratio.
        xoffset = margin[0]
        yoffset = margin[1]
        cw = width - margin[0] - margin[2]
        iw, ih = float(obj.Width), float(obj.Height)
        ch = 1.0 * cw / iw * ih
        height = ch + margin[1] + margin[3]
        # Format numbers compactly: strip trailing zeros and dots.
        p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset))
        contents.stream = fmt % p
        resources = PdfDict(XObject=PdfDict(MyImage=obj))
        mbox = PdfArray((0, 0, width, height))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=mbox,
        Resources=resources,
        Contents=contents,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def trivial_xobjs(maxignore=300):
    ''' Return a predicate that ignores XObjects which trivially
        contain other XObjects: short streams consisting only of
        painting operators (q/Q/cm/Do), names, and numbers.
    '''
    ignore = set('q Q cm Do'.split())
    Image = PdfName.Image

    def check(obj):
        # Images are never trivial wrappers.
        if obj.Subtype == Image:
            return False
        s = obj.stream
        if len(s) < maxignore:
            # Drop names and the ignorable operators...
            s = (x for x in s.split() if not x.startswith('/') and
                 x not in ignore)
            # ...then erase decimal points and minus signs; if only
            # digit-strings remain, the stream held nothing but
            # numbers and the ignored operators.
            s = (x.replace('.', '').replace('-', '') for x in s)
            if not [x for x in s if not x.isdigit()]:
                return True
    return check
|
||||||
|
|
||||||
|
|
||||||
|
def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
                  image_only=False, ignore=trivial_xobjs(),
                  wrap_object=wrap_object):
    ''' page_per_xobj wraps every XObj found in its own page object.

        xobj_iter -- an iterable of XObjects, or a list/dict which
                     is searched with find_objects.
        width, margin -- geometry passed to wrap_object; margin may
                     be a scalar or a 1/2/4-element sequence, which
                     is doubled up until it has 4 elements.
        image_only -- when True, wrap only Image XObjects.
        ignore -- predicate returning truth for XObjects to skip.
    '''
    try:
        iter(margin)
    except TypeError:
        # Scalar margin: promote to a sequence before expansion.
        # (Previously a bare except:, which could mask real errors.)
        margin = [margin]
    while len(margin) < 4:
        margin *= 2

    if isinstance(xobj_iter, (list, dict)):
        xobj_iter = find_objects(xobj_iter)
    for obj in xobj_iter:
        if not ignore(obj):
            # BUG FIX: this compared against PdfName.IMage (typo),
            # which builds the name /IMage and never matches /Image,
            # so image_only=True silently produced no pages.
            if not image_only or obj.Subtype == PdfName.Image:
                yield wrap_object(obj, width, margin)
|
|
@ -0,0 +1,19 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Objects that can occur in PDF files. The most important
|
||||||
|
objects are arrays and dicts. Either of these can be
|
||||||
|
indirect or not, and dicts could have an associated
|
||||||
|
stream.
|
||||||
|
'''
|
||||||
|
from .pdfname import PdfName
|
||||||
|
from .pdfdict import PdfDict, IndirectPdfDict
|
||||||
|
from .pdfarray import PdfArray
|
||||||
|
from .pdfobject import PdfObject
|
||||||
|
from .pdfstring import PdfString
|
||||||
|
from .pdfindirect import PdfIndirect
|
||||||
|
|
||||||
|
__all__ = """PdfName PdfDict IndirectPdfDict PdfArray
|
||||||
|
PdfObject PdfString PdfIndirect""".split()
|
|
@ -0,0 +1,71 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
from .pdfindirect import PdfIndirect
|
||||||
|
from .pdfobject import PdfObject
|
||||||
|
|
||||||
|
|
||||||
|
# Sentinel no-op: installed as a PdfArray's _resolve method once all
# indirect members have been replaced, making later resolve calls free.
def _resolved():
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class PdfArray(list):
    ''' A PdfArray maps the PDF file array object into a Python list.
        It has an indirect attribute which defaults to False.

        Members that were parsed as indirect references are resolved
        lazily: the first access through any list-protocol method
        replaces them in place with their real values (with None
        becoming the PDF null object).
    '''
    indirect = False

    def __init__(self, source=()):
        # Default changed from the shared mutable [] to an immutable
        # tuple -- behavior is identical (source is only read here),
        # but this avoids the mutable-default-argument pitfall.
        self._resolve = self._resolver
        self.extend(source)

    def _resolver(self, isinstance=isinstance, enumerate=enumerate,
                  listiter=list.__iter__, PdfIndirect=PdfIndirect,
                  resolved=_resolved, PdfNull=PdfObject('null')):
        # Replace every PdfIndirect member with its resolved value,
        # then swap in the no-op so later calls cost nothing.
        for index, value in enumerate(list.__iter__(self)):
            if isinstance(value, PdfIndirect):
                value = value.real_value()
                if value is None:
                    value = PdfNull
                self[index] = value
        self._resolve = resolved

    def __getitem__(self, index, listget=list.__getitem__):
        self._resolve()
        return listget(self, index)

    try:
        def __getslice__(self, i, j, listget=list.__getslice__):
            self._resolve()
            return listget(self, i, j)
    except AttributeError:
        # Python 3 has no list.__getslice__; slicing goes through
        # __getitem__ there, which already resolves.
        pass

    def __iter__(self, listiter=list.__iter__):
        self._resolve()
        return listiter(self)

    def count(self, item):
        self._resolve()
        return list.count(self, item)

    def index(self, item):
        self._resolve()
        return list.index(self, item)

    def remove(self, item):
        self._resolve()
        return list.remove(self, item)

    def sort(self, *args, **kw):
        self._resolve()
        return list.sort(self, *args, **kw)

    def pop(self, *args):
        self._resolve()
        return list.pop(self, *args)

    def __reversed__(self):
        self._resolve()
        return list.__reversed__(self)
|
|
@ -0,0 +1,241 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
from .pdfname import PdfName, BasePdfName
|
||||||
|
from .pdfindirect import PdfIndirect
|
||||||
|
from .pdfobject import PdfObject
|
||||||
|
from ..py23_diffs import iteritems
|
||||||
|
from ..errors import PdfParseError
|
||||||
|
|
||||||
|
|
||||||
|
class _DictSearch(object):
    ''' Used to search for inheritable attributes: looks a name up
        on the base dict and, if absent, walks up the /Parent chain
        until the attribute is found or the chain ends.
    '''

    def __init__(self, basedict):
        self.basedict = basedict

    def __getattr__(self, name, PdfName=PdfName):
        # Attribute access maps to PDF-name lookup, e.g.
        # search.MediaBox -> search['/MediaBox'].
        return self[PdfName(name)]

    def __getitem__(self, name, set=set, getattr=getattr, id=id):
        visited = set()
        mydict = self.basedict
        while 1:
            value = mydict[name]
            if value is not None:
                return value
            # Guard against a corrupt /Parent cycle.
            # NOTE(review): assert is stripped under python -O;
            # consider raising an explicit error instead.
            myid = id(mydict)
            assert myid not in visited
            visited.add(myid)
            mydict = mydict.Parent
            # Top of the chain with nothing found: implicitly None.
            if mydict is None:
                return
|
||||||
|
|
||||||
|
|
||||||
|
class _Private(object):
|
||||||
|
''' Used to store private attributes (not output to PDF files)
|
||||||
|
on PdfDict classes
|
||||||
|
'''
|
||||||
|
|
||||||
|
def __init__(self, pdfdict):
|
||||||
|
vars(self)['pdfdict'] = pdfdict
|
||||||
|
|
||||||
|
def __setattr__(self, name, value):
|
||||||
|
vars(self.pdfdict)[name] = value
|
||||||
|
|
||||||
|
|
||||||
|
class PdfDict(dict):
    ''' PdfDict objects are subclassed dictionaries
        with the following features:

        - Every key in the dictionary starts with "/"

        - A dictionary item can be deleted by assigning it to None

        - Keys that (after the initial "/") conform to Python
          naming conventions can also be accessed (set and retrieved)
          as attributes of the dictionary.  E.g.  mydict.Page is the
          same thing as mydict['/Page']

        - Private attributes (not in the PDF space) can be set
          on the dictionary object attribute dictionary by using
          the private attribute:

                mydict.private.foo = 3
                mydict.foo = 5
                x = mydict.foo       # x will now contain 3
                y = mydict['/foo']   # y will now contain 5

          Most standard adobe dictionary keys start with an upper case letter,
          so to avoid conflicts, it is best to start private attributes with
          lower case letters.

        - PdfDicts have the following read-only properties:

          - private -- as discussed above, provides write access to
            dictionary's attributes
          - inheritable -- this creates and returns a "view" attribute
            that will search through the object hierarchy for
            any desired attribute, such as /Rotate or /MediaBox

        - PdfDicts also have the following special attributes:
          - indirect is not stored in the PDF dictionary, but in the object's
            attribute dictionary
          - stream is also stored in the object's attribute dictionary
            and will also update the stream length.
          - _stream will store in the object's attribute dictionary without
            updating the stream length.

          It is possible, for example, to have a PDF name such as "/indirect"
          or "/stream", but you cannot access such a name as an attribute:

            mydict.indirect -- accesses object's attribute dictionary
            mydict["/indirect"] -- accesses actual PDF dictionary
    '''
    indirect = False
    stream = None

    # Maps special attribute names to (instance attribute to set,
    # whether /Length must be kept in sync with the value).
    _special = dict(indirect=('indirect', False),
                    stream=('stream', True),
                    _stream=('stream', False),
                    )

    def __setitem__(self, name, value, setter=dict.__setitem__,
                    BasePdfName=BasePdfName, isinstance=isinstance):
        ''' Set a dictionary item.  Keys must be PdfNames;
            assigning None removes the item.
        '''
        if not isinstance(name, BasePdfName):
            raise PdfParseError('Dict key %s is not a PdfName' % repr(name))
        if value is not None:
            setter(self, name, value)
        elif name in self:
            del self[name]

    def __init__(self, *args, **kw):
        ''' Optionally initialize from another mapping (copying the
            indirect and stream attributes when it is a PdfDict).
            Keyword arguments are routed through __setattr__, so
            "indirect", "stream", and "_stream" work as expected.
        '''
        if args:
            if len(args) == 1:
                args = args[0]
            self.update(args)
            if isinstance(args, PdfDict):
                self.indirect = args.indirect
                self._stream = args.stream
        for key, value in iteritems(kw):
            setattr(self, key, value)

    def __getattr__(self, name, PdfName=PdfName):
        ''' If the attribute doesn't exist on the dictionary object,
            try to slap a '/' in front of it and get it out
            of the actual dictionary itself.
        '''
        return self.get(PdfName(name))

    def get(self, key, dictget=dict.get, isinstance=isinstance,
            PdfIndirect=PdfIndirect):
        ''' Get a value out of the dictionary,
            after resolving any indirect objects.
        '''
        value = dictget(self, key)
        if isinstance(value, PdfIndirect):
            # We used to use self[key] here, but that does an
            # unwanted check on the type of the key (github issue #98).
            # Python will keep the old key object in the dictionary,
            # so that check is not necessary.
            value = value.real_value()
            if value is not None:
                dict.__setitem__(self, key, value)
            else:
                # The indirect reference resolved to nothing, so drop
                # the stale entry.  (Bug fix: this previously said
                # "del self[name]", which raised NameError because
                # "name" is not defined in this method.)
                del self[key]
        return value

    def __getitem__(self, key):
        return self.get(key)

    def __setattr__(self, name, value, special=_special.get,
                    PdfName=PdfName, vars=vars):
        ''' Set an attribute on the dictionary.  Handle the keywords
            indirect, stream, and _stream specially (for content objects)
        '''
        info = special(name)
        if info is None:
            self[PdfName(name)] = value
        else:
            name, setlen = info
            vars(self)[name] = value
            if setlen:
                # Keep /Length consistent with the stream contents.
                notnone = value is not None
                self.Length = notnone and PdfObject(len(value)) or None

    def iteritems(self, dictiter=iteritems,
                  isinstance=isinstance, PdfIndirect=PdfIndirect,
                  BasePdfName=BasePdfName):
        ''' Iterate over the dictionary, resolving any unresolved objects
        '''
        for key, value in list(dictiter(self)):
            if isinstance(value, PdfIndirect):
                self[key] = value = value.real_value()
            if value is not None:
                if not isinstance(key, BasePdfName):
                    raise PdfParseError('Dict key %s is not a PdfName' %
                                        repr(key))
                yield key, value

    def items(self):
        return list(self.iteritems())

    def itervalues(self):
        for key, value in self.iteritems():
            yield value

    def values(self):
        return list((value for key, value in self.iteritems()))

    def keys(self):
        return list((key for key, value in self.iteritems()))

    def __iter__(self):
        for key, value in self.iteritems():
            yield key

    def iterkeys(self):
        return iter(self)

    def copy(self):
        return type(self)(self)

    def pop(self, key):
        ''' Remove the given key and return its (resolved) value.
            Raises KeyError if the key is absent.
        '''
        value = self.get(key)
        del self[key]
        return value

    def popitem(self):
        ''' Remove and return an arbitrary (key, value) pair, with the
            value resolved if it was an indirect object.
            Bug fix: this previously called dict.pop(self), which
            always raised TypeError because dict.pop requires a key
            argument; dict.popitem is the correct primitive, and it
            already returns the (key, value) pair.
        '''
        key, value = dict.popitem(self)
        if isinstance(value, PdfIndirect):
            value = value.real_value()
        return key, value

    def inheritable(self):
        ''' Search through ancestors as needed for inheritable
            dictionary items.
            NOTE:  You might think it would be a good idea
            to cache this class, but then you'd have to worry
            about it pointing to the wrong dictionary if you
            made a copy of the object...
        '''
        return _DictSearch(self)
    inheritable = property(inheritable)

    def private(self):
        ''' Allows setting private metadata for use in
            processing (not sent to PDF file).
            See note on inheritable
        '''
        return _Private(self)
    private = property(private)
|
||||||
|
|
||||||
|
|
||||||
|
class IndirectPdfDict(PdfDict):
    ''' Convenience subclass: a PdfDict that will be written to the
        output PDF as an indirect object.  Equivalent to creating a
        plain PdfDict and then setting indirect = True on it.
    '''
    indirect = True
|
|
@ -0,0 +1,22 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
|
||||||
|
class _NotLoaded(object):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PdfIndirect(tuple):
    ''' A placeholder for an object that hasn't been read in yet.
        The tuple itself holds (object number, generation number);
        attributes (set externally, e.g. _loader) describe where the
        reference came from and how to fetch the real object.
    '''
    # Cached resolved object; the sentinel means "not fetched yet".
    value = _NotLoaded

    def real_value(self, NotLoaded=_NotLoaded):
        """Return the referenced object, loading and caching it on
        first use via the externally-supplied _loader callable."""
        result = self.value
        if result is NotLoaded:
            result = self.value = self._loader(self)
        return result
|
|
@ -0,0 +1,81 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ..errors import log
|
||||||
|
|
||||||
|
warn = log.warning
|
||||||
|
|
||||||
|
|
||||||
|
class BasePdfName(str):
    ''' A PdfName is an identifier that starts with a slash.

        If a PdfName contains characters that are illegal in a raw
        PDF name (whitespace or delimiters), it grows an "encoded"
        attribute with those characters escaped as #<hex><hex>.

        The "encoded" attribute is what is written out to a PDF file;
        the (decoded) string value itself is what is compared for
        equality in a PDF dictionary.
    '''

    indirect = False
    encoded = None

    whitespace = '\x00 \t\f\r\n'
    delimiters = '()<>{}[]/%'
    # Alternation members for the encoder regex: bare whitespace
    # characters plus backslash-escaped delimiters.
    forbidden = [c for c in whitespace] + ['\\' + c for c in delimiters]
    remap = dict((c, '#%02X' % ord(c)) for c in (whitespace + delimiters))
    split_to_encode = re.compile('(%s)' % '|'.join(forbidden)).split
    split_to_decode = re.compile(r'\#([0-9A-Fa-f]{2})').split

    def __new__(cls, name, pre_encoded=True, remap=remap,
                join=''.join, new=str.__new__, chr=chr, int=int,
                split_to_encode=split_to_encode,
                split_to_decode=split_to_decode,
                ):
        ''' Build a PdfName either from a raw name string or from a
            pre-encoded one (e.g. one coming in from a file).
        '''
        # Fast path: purely alphanumeric names need no translation.
        if name[1:].isalnum():
            return new(cls, name)
        encoded = name
        if pre_encoded:
            # Decode any #XX escapes to recover the real name; the
            # capture-group split puts the hex digit pairs at the odd
            # indices.
            if '#' in name:
                pieces = split_to_decode(name)
                pieces[1::2] = (chr(int(code, 16)) for code in pieces[1::2])
                name = join(pieces)
        else:
            # Escape whitespace/delimiter characters.  The split of
            # '/xyz...' starts ['', '/', ...], so the first character
            # that can need remapping sits at index 3 -- the leading
            # slash (index 1) must stay as-is.
            parts = split_to_encode(encoded)
            parts[3::2] = (remap[x] for x in parts[3::2])
            encoded = join(parts)
        self = new(cls, name)
        if encoded != name:
            self.encoded = encoded
        return self
|
||||||
|
|
||||||
|
|
||||||
|
# We could have used a metaclass, but this matches what
|
||||||
|
# we were doing historically.
|
||||||
|
|
||||||
|
class PdfName(object):
    ''' Two simple ways to get a PDF name from a string:

            x = PdfName.FooBar
            x = PdfName('FooBar')

        Either technique will return "/FooBar"
    '''

    def __getattr__(self, name, BasePdfName=BasePdfName):
        # Attribute style: PdfName.FooBar
        return BasePdfName('/' + name, False)

    def __call__(self, name, BasePdfName=BasePdfName):
        # Call style: PdfName('FooBar')
        return BasePdfName('/' + name, False)


# Replace the class with a singleton instance; historically pdfrw
# exposed this object rather than using a metaclass.
PdfName = PdfName()
|
|
@ -0,0 +1,11 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
|
||||||
|
class PdfObject(str):
    ''' A PdfObject is a textual representation of any PDF file object
        other than an array, dict or string.  It carries an indirect
        attribute, which defaults to False (direct object).
    '''
    indirect = False
|
|
@ -0,0 +1,553 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||||
|
# 2016 James Laird-Wah, Sydney, Australia
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
================================
|
||||||
|
PdfString encoding and decoding
|
||||||
|
================================
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
=============
|
||||||
|
|
||||||
|
|
||||||
|
This module handles encoding and decoding of PDF strings. PDF strings
|
||||||
|
are described in the PDF 1.7 reference manual, mostly in chapter 3
|
||||||
|
(sections 3.2 and 3.8) and chapter 5.
|
||||||
|
|
||||||
|
PDF strings are used in the document structure itself, and also inside
|
||||||
|
the stream of page contents dictionaries.
|
||||||
|
|
||||||
|
A PDF string can represent pure binary data (e.g. for a font or an
|
||||||
|
image), or text, or glyph indices. For Western fonts, the glyph indices
|
||||||
|
usually correspond to ASCII, but that is not guaranteed. (When it does
|
||||||
|
happen, it makes examination of raw PDF data a lot easier.)
|
||||||
|
|
||||||
|
The specification defines PDF string encoding at two different levels.
|
||||||
|
At the bottom, it defines ways to encode arbitrary bytes so that a PDF
|
||||||
|
tokenizer can understand they are a string of some sort, and can figure
|
||||||
|
out where the string begins and ends. (That is all the tokenizer itself
|
||||||
|
cares about.) Above that level, if the string represents text, the
|
||||||
|
specification defines ways to encode Unicode text into raw bytes, before
|
||||||
|
the byte encoding is performed.
|
||||||
|
|
||||||
|
There are two ways to do the byte encoding, and two ways to do the text
|
||||||
|
(Unicode) encoding.
|
||||||
|
|
||||||
|
Encoding bytes into PDF strings
|
||||||
|
================================
|
||||||
|
|
||||||
|
Adobe calls the two ways to encode bytes into strings "Literal strings"
|
||||||
|
and "Hexadecimal strings."
|
||||||
|
|
||||||
|
Literal strings
|
||||||
|
------------------
|
||||||
|
|
||||||
|
A literal string is delimited by ASCII parentheses ("(" and ")"), and a
|
||||||
|
hexadecimal string is delimited by ASCII less-than and greater-than
|
||||||
|
signs ("<" and ">").
|
||||||
|
|
||||||
|
A literal string may encode bytes almost unmolested. The caveat is
|
||||||
|
that if a byte has the same value as a parenthesis, it must be escaped
|
||||||
|
so that the tokenizer knows the string is not finished. This is accomplished
|
||||||
|
by using the ASCII backslash ("\") as an escape character. Of course,
|
||||||
|
now any backslash appearing in the data must likewise be escaped.
|
||||||
|
|
||||||
|
Hexadecimal strings
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
A hexadecimal string requires twice as much space as the source data
|
||||||
|
it represents (plus two bytes for the delimiter), simply storing each
|
||||||
|
byte as two hexadecimal digits, most significant digit first. The spec
|
||||||
|
allows for lower or upper case hex digits, but most PDF encoders seem
|
||||||
|
to use upper case.
|
||||||
|
|
||||||
|
Special cases -- Legacy systems and readability
|
||||||
|
-----------------------------------------------
|
||||||
|
|
||||||
|
It is possible to create a PDF document that uses 7 bit ASCII encoding,
|
||||||
|
and it is desirable in many cases to create PDFs that are reasonably
|
||||||
|
readable when opened in a text editor. For these reasons, the syntax
|
||||||
|
for both literal strings and hexadecimal strings is slightly more
|
||||||
|
complicated than the initial description above. In general, the additional
|
||||||
|
syntax allows the following features:
|
||||||
|
|
||||||
|
- Making the delineation between characters, or between sections of
|
||||||
|
a string, apparent, and easy to see in an editor.
|
||||||
|
- Keeping output lines from getting too wide for some editors
|
||||||
|
- Keeping output lines from being so narrow that you can only see the
|
||||||
|
small fraction of a string at a time in an editor.
|
||||||
|
- Suppressing unprintable characters
|
||||||
|
- Restricting the output string to 7 bit ASCII
|
||||||
|
|
||||||
|
Hexadecimal readability
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For hexadecimal strings, only the first two bullets are relevant. The syntax
|
||||||
|
to accomplish this is simple, allowing any ASCII whitespace to be inserted
|
||||||
|
anywhere in the encoded hex string.
|
||||||
|
|
||||||
|
Literal readability
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For literal strings, all of the bullets except the first are relevant.
|
||||||
|
The syntax has two methods to help with these goals. The first method
|
||||||
|
is to overload the escape operator to be able to do different functions,
|
||||||
|
and the second method can reduce the number of escapes required for
|
||||||
|
parentheses in the normal case.
|
||||||
|
|
||||||
|
The escape function works differently, depending on what byte follows
|
||||||
|
the backslash. In all cases, the escaping backslash is discarded,
|
||||||
|
and then the next character is examined:
|
||||||
|
|
||||||
|
- For parentheses and backslashes (and, in fact, for all characters
|
||||||
|
not described otherwise in this list), the character after the
|
||||||
|
backslash is preserved in the output.
|
||||||
|
- A letter from the set of "nrtbf" following a backslash is interpreted as
|
||||||
|
a line feed, carriage return, tab, backspace, or form-feed, respectively.
|
||||||
|
- One to three octal digits following the backslash indicate the
|
||||||
|
numeric value of the encoded byte.
|
||||||
|
- A carriage return, carriage return/line feed, or line feed following
|
||||||
|
the backslash indicates a line break that was put in for readability,
|
||||||
|
and that is not part of the actual data, so this is discarded.
|
||||||
|
|
||||||
|
The second method that can be used to improve readability (and reduce space)
|
||||||
|
in literal strings is to not escape parentheses. This only works, and is
|
||||||
|
only allowed, when the parentheses are properly balanced. For example,
|
||||||
|
"((Hello))" is a valid encoding for a literal string, but "((Hello)" is not;
|
||||||
|
the latter case should be encoded "(\(Hello)"
|
||||||
|
|
||||||
|
Encoding text into strings
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Section 3.8.1 of the PDF specification describes text strings.
|
||||||
|
|
||||||
|
The individual characters of a text string can all be considered to
|
||||||
|
be Unicode; Adobe specifies two different ways to encode these characters
|
||||||
|
into a string of bytes before further encoding the byte string as a
|
||||||
|
literal string or a hexadecimal string.
|
||||||
|
|
||||||
|
The first way to encode these strings is called PDFDocEncoding. This
|
||||||
|
is mostly a one-for-one mapping of bytes into single bytes, similar to
|
||||||
|
Latin-1. The representable character set is limited to the number of
|
||||||
|
characters that can fit in a byte, and this encoding cannot be used
|
||||||
|
with Unicode strings that start with the two characters making up the
|
||||||
|
UTF-16-BE BOM.
|
||||||
|
|
||||||
|
The second way to encode these strings is with UTF-16-BE. Text strings
|
||||||
|
encoded with this method must start with the BOM, and although the spec
|
||||||
|
does not appear to mandate that the resultant bytes be encoded into a
|
||||||
|
hexadecimal string, that seems to be the canonical way to do it.
|
||||||
|
|
||||||
|
When encoding a string into UTF-16-BE, this module always adds the BOM,
|
||||||
|
and when decoding a string from UTF-16-BE, this module always strips
|
||||||
|
the BOM. If a source string contains a BOM, that will remain in the
|
||||||
|
final string after a round-trip through the encoder and decoder, as
|
||||||
|
the goal of the encoding/decoding process is transparency.
|
||||||
|
|
||||||
|
|
||||||
|
PDF string handling in pdfrw
|
||||||
|
=============================
|
||||||
|
|
||||||
|
Responsibility for handling PDF strings in the pdfrw library is shared
|
||||||
|
between this module, the tokenizer, and the pdfwriter.
|
||||||
|
|
||||||
|
tokenizer string handling
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
As far as the tokenizer and its clients such as the pdfreader are concerned,
|
||||||
|
the PdfString class must simply be something that it can instantiate by
|
||||||
|
passing a string, that doesn't compare equal (or throw an exception when
|
||||||
|
compared) to other possible token strings. The tokenizer must understand
|
||||||
|
enough about the syntax of the string to successfully find its beginning
|
||||||
|
and end in a stream of tokens, but doesn't otherwise know or care about
|
||||||
|
the data represented by the string.
|
||||||
|
|
||||||
|
pdfwriter string handling
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
The pdfwriter knows and cares about two attributes of PdfString instances:
|
||||||
|
|
||||||
|
- First, PdfString objects have an 'indirect' attribute, which pdfwriter
|
||||||
|
uses as an indication that the object knows how to represent itself
|
||||||
|
correctly when output to a new PDF. (In the case of a PdfString object,
|
||||||
|
no work is really required, because it is already a string.)
|
||||||
|
- Second, the PdfString.encode() method is used as a convenience to
|
||||||
|
automatically convert any user-supplied strings (that didn't come
|
||||||
|
from PDFs) when a PDF is written out to a file.
|
||||||
|
|
||||||
|
pdfstring handling
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The code in this module is designed to support those uses by the
|
||||||
|
tokenizer and the pdfwriter, and to additionally support encoding
|
||||||
|
and decoding of PdfString objects as a convenience for the user.
|
||||||
|
|
||||||
|
Most users of the pdfrw library never encode or decode a PdfString,
|
||||||
|
so it is imperative that (a) merely importing this module does not
|
||||||
|
take a significant amount of CPU time; and (b) it is cheap for the
|
||||||
|
tokenizer to produce a PdfString, and cheap for the pdfwriter to
|
||||||
|
consume a PdfString -- if the tokenizer finds a string that conforms
|
||||||
|
to the PDF specification, it will be wrapped in a PdfString object,
|
||||||
|
and if the pdfwriter finds an object with an indirect attribute, it
|
||||||
|
simply calls str() to ask it to format itself.
|
||||||
|
|
||||||
|
Encoding and decoding are not actually performed very often at all,
|
||||||
|
compared to how often tokenization and then subsequent concatenation
|
||||||
|
by the pdfwriter are performed. In fact, versions of pdfrw prior to
|
||||||
|
0.4 did not even support Unicode for this function. Encoding and
|
||||||
|
decoding can also easily be performed by the user, outside of the
|
||||||
|
library, and this might still be recommended, at least for encoding,
|
||||||
|
if the visual appeal of encodings generated by this module is found
|
||||||
|
lacking.
|
||||||
|
|
||||||
|
|
||||||
|
Decoding strings
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Decoding strings can be tricky, but is a bounded process. Each
|
||||||
|
properly-encoded string represents exactly one output string,
|
||||||
|
with the caveat that it is up to the caller of the function to know whether
|
||||||
|
he expects a Unicode string, or just bytes.
|
||||||
|
|
||||||
|
The caller can call PdfString.to_bytes() to get a byte string (which may
|
||||||
|
or may not represent encoded Unicode), or may call PdfString.to_unicode()
|
||||||
|
to get a Unicode string. Byte strings will be regular strings in Python 2,
|
||||||
|
and b'' bytes in Python 3; Unicode strings will be regular strings in
|
||||||
|
Python 3, and u'' unicode strings in Python 2.
|
||||||
|
|
||||||
|
To maintain application compatibility with earlier versions of pdfrw,
|
||||||
|
PdfString.decode() is an alias for PdfString.to_unicode().
|
||||||
|
|
||||||
|
Encoding strings
|
||||||
|
~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
PdfString has three factory functions that will encode strings into
|
||||||
|
PdfString objects:
|
||||||
|
|
||||||
|
- PdfString.from_bytes() accepts a byte string (regular string in Python 2
|
||||||
|
or b'' bytes string in Python 3) and returns a PdfString object.
|
||||||
|
- PdfString.from_unicode() accepts a Unicode string (u'' Unicode string in
|
||||||
|
Python 2 or regular string in Python 3) and returns a PdfString object.
|
||||||
|
- PdfString.encode() examines the type of object passed, and either
|
||||||
|
calls from_bytes() or from_unicode() to do the real work.
|
||||||
|
|
||||||
|
Unlike decoding(), encoding is not (mathematically) a function.
|
||||||
|
There are (literally) an infinite number of ways to encode any given
|
||||||
|
source string. (Of course, most of them would be stupid, unless
|
||||||
|
the intent is some sort of denial-of-service attack.)
|
||||||
|
|
||||||
|
So encoding strings is either simpler than decoding, or can be made to
|
||||||
|
be an open-ended science fair project (to create the best looking
|
||||||
|
encoded strings).
|
||||||
|
|
||||||
|
There are parameters to the encoding functions that allow control over
|
||||||
|
the final encoded string, but the intention is to make the default values
|
||||||
|
produce a reasonable encoding.
|
||||||
|
|
||||||
|
As mentioned previously, if encoding does not do what a particular
|
||||||
|
user needs, that user is free to write his own encoder, and then
|
||||||
|
simply instantiate a PdfString object by passing a string to the
|
||||||
|
default constructor, the same way that the tokenizer does it.
|
||||||
|
|
||||||
|
However, if desirable, encoding may gradually become more capable
|
||||||
|
over time, adding the ability to generate more aesthetically pleasing
|
||||||
|
encoded strings.
|
||||||
|
|
||||||
|
PDFDocString encoding and decoding
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
To handle this encoding in a fairly standard way, this module registers
|
||||||
|
an encoder and decoder for PDFDocEncoding with the codecs module.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import codecs
|
||||||
|
import binascii
|
||||||
|
import itertools
|
||||||
|
from ..py23_diffs import convert_load, convert_store
|
||||||
|
|
||||||
|
def find_pdfdocencoding(encoding):
    """ Codec search function conforming to the codecs-module
        registration protocol.  The lookup tables are only built
        when a pdfdocencoding encode or decode is actually requested.

        PDFDocEncoding is described in the PDF 1.7 reference manual.
    """

    if encoding != 'pdfdocencoding':
        return

    # Build the decoding map from the table in section D.2 of the
    # PDF 1.7 manual.

    # Code points with a 1:1 byte <-> Unicode correspondence:
    # tab/LF/CR, printable ASCII, and 0xA1-0xFF minus the soft
    # hyphen (0xAD).
    identity = set(range(0x20, 0x7F)) | set(range(0xA1, 0x100))
    identity.update((0x09, 0x0A, 0x0D))
    identity.remove(0xAD)
    decoding_map = dict((code, code) for code in identity)

    # Special Unicode characters mapped into the control ranges.
    decoding_map.update(zip(range(0x18, 0x20), (
        0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC)))
    decoding_map.update(zip(range(0x80, 0x9F), (
        0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
        0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018,
        0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160,
        0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E)))
    decoding_map[0xA0] = 0x20AC

    # The encoding map is the inverse of the decoding map.
    encoding_map = codecs.make_encoding_map(decoding_map)

    # Not every PDF producer follows the spec, so conform to Postel's
    # law and interpret encoded strings if at all possible.  In
    # particular, they might have nulls and form-feeds, judging by
    # random code snippets floating around the internet.  (Added after
    # the encoding map is built, so these bytes decode but never encode.)
    decoding_map.update(((code, code) for code in range(0x18)))

    def encode(input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_map)

    def decode(input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_map)

    return codecs.CodecInfo(encode, decode, name='pdfdocencoding')
|
||||||
|
|
||||||
|
# Register the lazy search function; importing this module stays cheap
# because the maps are only built on the first actual encode/decode.
codecs.register(find_pdfdocencoding)
|
||||||
|
|
||||||
|
class PdfString(str):
|
||||||
|
""" A PdfString is an encoded string. It has a decode
|
||||||
|
method to get the actual string data out, and there
|
||||||
|
is an encode class method to create such a string.
|
||||||
|
Like any PDF object, it could be indirect, but it
|
||||||
|
defaults to being a direct object.
|
||||||
|
"""
|
||||||
|
    # Direct object by default, like any other PDF object.
    indirect = False

    # The byte order mark, and unicode that could be
    # wrongly encoded into the byte order mark by the
    # pdfdocencoding codec.

    bytes_bom = codecs.BOM_UTF16_BE
    bad_pdfdoc_prefix = bytes_bom.decode('latin-1')

    # Used by decode_literal; filled in on first use (lazy so that
    # merely importing this module stays cheap -- most users never
    # decode a string).

    unescape_dict = None
    unescape_func = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def init_unescapes(cls):
|
||||||
|
""" Sets up the unescape attributes for decode_literal
|
||||||
|
"""
|
||||||
|
unescape_pattern = r'\\([0-7]{1,3}|\r\n|.)'
|
||||||
|
unescape_func = re.compile(unescape_pattern, re.DOTALL).split
|
||||||
|
cls.unescape_func = unescape_func
|
||||||
|
|
||||||
|
unescape_dict = dict(((chr(x), chr(x)) for x in range(0x100)))
|
||||||
|
unescape_dict.update(zip('nrtbf', '\n\r\t\b\f'))
|
||||||
|
unescape_dict['\r'] = ''
|
||||||
|
unescape_dict['\n'] = ''
|
||||||
|
unescape_dict['\r\n'] = ''
|
||||||
|
for i in range(0o10):
|
||||||
|
unescape_dict['%01o' % i] = chr(i)
|
||||||
|
for i in range(0o100):
|
||||||
|
unescape_dict['%02o' % i] = chr(i)
|
||||||
|
for i in range(0o400):
|
||||||
|
unescape_dict['%03o' % i] = chr(i)
|
||||||
|
cls.unescape_dict = unescape_dict
|
||||||
|
return unescape_func
|
||||||
|
|
||||||
|
    def decode_literal(self):
        """ Decode a PDF literal string, which is enclosed in parentheses ()

            Many pdfrw users never decode strings, so defer creating
            data structures to do so until the first string is decoded.

            Possible string escapes from the spec:
            (PDF 1.7 Reference, section 3.2.3, page 53)

                1. \[nrtbf\()]: simple escapes
                2. \\d{1,3}: octal. Must be zero-padded to 3 digits
                   if followed by digit
                3. \<end of line>: line continuation. We don't know the EOL
                   marker used in the PDF, so accept \r, \n, and \r\n.
                4. Any other character following \ escape -- the backslash
                   is swallowed.
        """
        # The capture-group split puts unescaped runs at even indices
        # and each escape's payload (the text after the backslash) at
        # odd indices.
        result = (self.unescape_func or self.init_unescapes())(self[1:-1])
        if len(result) == 1:
            # No escapes at all -- fast path.
            return convert_store(result[0])
        # Replace every escape payload with its mapped value, then
        # stitch the pieces back together.
        unescape_dict = self.unescape_dict
        result[1::2] = [unescape_dict[x] for x in result[1::2]]
        return convert_store(''.join(result))
|
||||||
|
|
||||||
|
|
||||||
|
def decode_hex(self):
    """ Decode a PDF hexadecimal-encoded string, which is enclosed
        in angle brackets <>.

        Whitespace inside the brackets is ignored, and a missing
        final hex digit is treated as '0', per the PDF spec.
    """
    hexstr = convert_store(''.join(self[1:-1].split()))
    # BUG FIX: the parity test must be mod 2.  The original tested
    # "% 1", which is always zero, so a truncated odd-length string
    # was never padded and binascii.unhexlify raised instead.
    if len(hexstr) % 2:  # odd number of chars indicates a truncated 0
        hexstr += '0'
    return binascii.unhexlify(hexstr)
|
||||||
|
|
||||||
|
|
||||||
|
def to_bytes(self):
    """ Decode a PDF string to bytes.  This is a convenience for
        user code; (as of pdfrw 0.3) it is never actually called
        inside pdfrw itself.

        Dispatches on the enclosing delimiters: () means a literal
        string, <> a hexadecimal one; anything else is an error.
    """
    if self.startswith('(') and self.endswith(')'):
        return self.decode_literal()
    if self.startswith('<') and self.endswith('>'):
        return self.decode_hex()
    raise ValueError('Invalid PDF string "%s"' % repr(self))
|
||||||
|
|
||||||
|
def to_unicode(self):
    """ Decode a PDF string to a unicode string.  This is a
        convenience for user code; (as of pdfrw 0.3) it is never
        actually called inside pdfrw itself.

        Two storage encodings exist: UTF-16-BE (signalled by a
        byte order marker in the first two bytes) and the
        PDFDocEncoding defined in the PDF spec.
    """
    raw = self.to_bytes()
    # The BOM in the first two bytes selects the encoding.
    if raw[:2] == self.bytes_bom:
        return raw[2:].decode('utf-16-be')
    return raw.decode('pdfdocencoding')
|
||||||
|
|
||||||
|
# Legacy-compatible interface: older callers used .decode().
decode = to_unicode

# Internal value used by encoding; compiled lazily on first use.
escape_splitter = None
|
||||||
|
|
||||||
|
@classmethod
def init_escapes(cls):
    """ Lazily compile the splitter that isolates the characters
        which must be backslash-escaped inside a literal string
        (parentheses and the backslash itself), for the encode path.
    """
    splitter = re.compile(br'(\(|\\|\))').split
    cls.escape_splitter = splitter
    return splitter
|
||||||
|
|
||||||
|
@classmethod
def from_bytes(cls, raw, bytes_encoding='auto'):
    """ Encode a raw byte string into a PdfString suitable for
        inclusion in a PDF.

        NOTE: There is no magic in the encoding process.  A user
        can do his own encoding and simply initialize a PdfString()
        instance with the encoded text -- useful, e.g., to add line
        breaks, to leave balanced parentheses unescaped, or to
        escape extra characters for readability.  Those features
        are not currently supported by this method.

        bytes_encoding may be 'literal' or 'hex' to force a
        conversion method, or 'auto' (default) to let a size
        heuristic choose.
    """
    use_hex = bytes_encoding == 'hex'
    parts = None
    if not use_hex:
        if bytes_encoding not in ('literal', 'auto'):
            raise ValueError('Invalid bytes_encoding value: %s'
                             % bytes_encoding)
        splitter = cls.escape_splitter or cls.init_escapes()
        parts = splitter(raw)
        # Heuristic: once escapes make the literal form at least as
        # long as the hex form, fall back to hexadecimal.
        if bytes_encoding == 'auto' and len(parts) // 2 >= len(raw):
            use_hex = True
    if use_hex:
        # The spec does not mandate uppercase,
        # but it seems to be the convention.
        return cls('<%s>' % convert_load(binascii.hexlify(raw).upper()))
    # Odd-indexed parts are the characters needing a backslash.
    parts[1::2] = [b'\\' + ch for ch in parts[1::2]]
    return cls('(%s)' % convert_load(b''.join(parts)))
|
||||||
|
|
||||||
|
@classmethod
def from_unicode(cls, source, text_encoding='auto',
                 bytes_encoding='auto'):
    """ Encode a unicode string into a PdfString suitable for
        inclusion in a PDF.

        NOTE: There is no magic in the encoding process.  A user
        can do his own encoding and simply initialize a PdfString()
        instance with the encoded text; see from_bytes for details.

        text_encoding may be 'pdfdocencoding' or 'utf16' to force a
        text encoding, and bytes_encoding may be 'literal' or 'hex'
        to force a byte-level conversion; 'auto' (the default for
        both) applies heuristics.  Raises if the requested
        conversion cannot be performed.
    """
    # Give preference to pdfdocencoding, since it only
    # requires one raw byte per character, rather than two.
    if text_encoding != 'utf16':
        force_pdfdoc = text_encoding == 'pdfdocencoding'
        if not force_pdfdoc and text_encoding != 'auto':
            raise ValueError('Invalid text_encoding value: %s'
                             % text_encoding)

        if source.startswith(cls.bad_pdfdoc_prefix):
            if force_pdfdoc:
                raise UnicodeError('Prefix of string %r cannot be encoded '
                                   'in pdfdocencoding' % source[:20])
            # else fall through to UTF-16 below
        else:
            try:
                raw = source.encode('pdfdocencoding')
            except UnicodeError:
                if force_pdfdoc:
                    raise
                # else fall through to UTF-16 below
            else:
                return cls.from_bytes(raw, bytes_encoding)

    # UTF-16-BE with a byte order marker.  Unless the user forces
    # literal strings, hexadecimal makes much more sense with
    # 2-byte characters.
    raw = cls.bytes_bom + source.encode('utf-16-be')
    encoding = 'hex' if bytes_encoding == 'auto' else bytes_encoding
    return cls.from_bytes(raw, encoding)
|
||||||
|
|
||||||
|
@classmethod
def encode(cls, source, uni_type=type(u''), isinstance=isinstance):
    """ Legacy constructor, kept as a convenience for PdfWriter:
        dispatches to from_unicode or from_bytes depending on the
        type of the source.
    """
    encoder = cls.from_unicode if isinstance(source, uni_type) else cls.from_bytes
    return encoder(source)
|
|
@ -0,0 +1,250 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
This module contains code to edit pages. Sort of a canvas, I
|
||||||
|
suppose, but I wouldn't want to call it that and get people all
|
||||||
|
excited or anything.
|
||||||
|
|
||||||
|
No, this is just for doing basic things like merging/splitting
|
||||||
|
apart pages, watermarking, etc. All it does is allow converting
|
||||||
|
pages (or parts of pages) into Form XObject rectangles, and then
|
||||||
|
plopping those down on new or pre-existing pages.
|
||||||
|
'''
|
||||||
|
|
||||||
|
from .objects import PdfDict, PdfArray, PdfName
|
||||||
|
from .buildxobj import pagexobj, ViewInfo
|
||||||
|
|
||||||
|
# Module-level sentinel: an all-defaults view.  RectXObj.__init__ compares
# its viewinfo argument against this by identity to reject keyword
# arguments on a preexisting ViewInfo.
NullInfo = ViewInfo()
|
||||||
|
|
||||||
|
|
||||||
|
class RectXObj(PdfDict):
    ''' This class facilitates doing positioning (moving and scaling)
        of Form XObjects within their containing page, by modifying
        the Form XObject's transformation matrix.

        By default, this class keeps the aspect ratio locked.  For
        example, if your object is foo, you can write 'foo.w = 200',
        and it will scale in both the x and y directions.

        To unlock the aspect ratio, you have to do a tiny bit of math
        and call the scale function.
    '''
    def __init__(self, page, viewinfo=NullInfo, **kw):
        ''' The page is a page returned by PdfReader.  It will be
            turned into a cached Form XObject (so that multiple
            rectangles can be extracted from it if desired), and then
            another Form XObject will be built using it and the viewinfo
            (which should be a ViewInfo class).  The viewinfo includes
            source coordinates (from the top/left) and rotation
            information.

            Once the object has been built, its destination coordinates
            may be examined and manipulated by using x, y, w, h, and
            scale.  The destination coordinates are in the normal
            PDF programmatic system (starting at bottom left).
        '''
        if kw:
            # Keywords are only valid when building a fresh ViewInfo.
            if viewinfo is not NullInfo:
                raise ValueError("Cannot modify preexisting ViewInfo")
            viewinfo = ViewInfo(**kw)
        viewinfo.cacheable = False
        base = pagexobj(page, viewinfo)
        self.update(base)
        self.indirect = True
        self.stream = base.stream
        private = self.private
        # _rect holds [x, y, w, h] in destination coordinates.
        private._rect = [base.x, base.y, base.w, base.h]
        matrix = self.Matrix
        if matrix is None:
            matrix = self.Matrix = PdfArray((1, 0, 0, 1, 0, 0))
        private._matrix = matrix  # Lookup optimization
        # Default to lower-left corner
        self.x = 0
        self.y = 0

    @property
    def x(self):
        ''' X location (from left) of object in points
        '''
        return self._rect[0]

    @property
    def y(self):
        ''' Y location (from bottom) of object in points
        '''
        return self._rect[1]

    @property
    def w(self):
        ''' Width of object in points
        '''
        return self._rect[2]

    @property
    def h(self):
        ''' Height of object in points
        '''
        return self._rect[3]

    def __setattr__(self, name, value, next=PdfDict.__setattr__,
                    mine=set('x y w h'.split())):
        ''' The underlying __setitem__ won't let us use a property
            setter, so we have to fake one.

            Setting x or y translates the object; setting w or h
            scales it (aspect-locked, via scale()).
        '''
        if name not in mine:
            return next(self, name, value)
        if name in 'xy':
            r_index, m_index = (0, 4) if name == 'x' else (1, 5)
            self._rect[r_index], old = value, self._rect[r_index]
            self._matrix[m_index] += value - old
        else:
            # BUG FIX: compare the attribute *name* against 'h'.
            # The original tested ``value == 'h'`` (value is the new
            # numeric size, so the test was always False), which made
            # ``obj.h = n`` scale relative to the width instead of
            # the height.
            index = 2 + (name == 'h')
            self.scale(value / self._rect[index])

    def scale(self, x_scale, y_scale=None):
        ''' Scale the object about its own (x, y) anchor point.

            Current scaling deals properly with things that
            have been rotated in 90 degree increments
            (via the ViewMerge object given when instantiating).
        '''
        if y_scale is None:
            # Aspect-locked scaling.
            y_scale = x_scale
        x, y, w, h = rect = self._rect
        ao, bo, co, do, eo, fo = matrix = self._matrix
        an = ao * x_scale
        bn = bo * y_scale
        cn = co * x_scale
        dn = do * y_scale
        # Keep the anchor point fixed while the matrix scales.
        en = x + (eo - x) * 1.0 * (an + cn) / (ao + co)
        fn = y + (fo - y) * 1.0 * (bn + dn) / (bo + do)
        matrix[:] = an, bn, cn, dn, en, fn
        rect[:] = x, y, w * x_scale, h * y_scale

    @property
    def box(self):
        ''' Return the bounding box for the object
        '''
        x, y, w, h = self._rect
        return PdfArray([x, y, x + w, y + h])
|
||||||
|
|
||||||
|
|
||||||
|
class PageMerge(list):
    ''' A PageMerge object can have 0 or 1 underlying pages
        (that get edited with the results of the merge)
        and 0-n RectXObjs that can be applied before or
        after the underlying page.
    '''
    # Class-level defaults; filled in by setpage() when a page is given.
    page = None
    mbox = None
    cbox = None
    resources = None
    rotate = None
    contents = None

    def __init__(self, page=None):
        ''' Optionally attach an underlying page immediately. '''
        if page is not None:
            self.setpage(page)

    def setpage(self, page):
        ''' Attach an underlying page and cache its (inheritable)
            attributes.  A None placeholder is appended to mark where
            the page's own contents sit relative to added objects.
        '''
        if page.Type != PdfName.Page:
            raise TypeError("Expected page")
        self.append(None)  # Placeholder
        self.page = page
        inheritable = page.inheritable
        self.mbox = inheritable.MediaBox
        self.cbox = inheritable.CropBox
        self.resources = inheritable.Resources
        self.rotate = inheritable.Rotate
        self.contents = page.Contents

    def __add__(self, other):
        ''' Allow "merge + obj" and "merge + [obj, ...]" to append
            objects; returns self for chaining.
        '''
        if isinstance(other, dict):
            other = [other]
        # NOTE: deliberately reuses the name while iterating the list.
        for other in other:
            self.add(other)
        return self

    def add(self, obj, prepend=False, **kw):
        ''' Add one object (a page or an XObject); pages and keyword
            views are wrapped in RectXObj.  prepend=True places the
            object under the existing content.  Returns self.
        '''
        if kw:
            obj = RectXObj(obj, **kw)
        elif obj.Type == PdfName.Page:
            obj = RectXObj(obj)
        if prepend:
            self.insert(0, obj)
        else:
            self.append(obj)
        return self

    def render(self):
        ''' Merge everything into a single page object and return it.
            Updates the underlying page in place when one was given.
        '''
        def do_xobjs(xobj_list, restore_first=False):
            # Register each XObject under a fresh /pdfrw_N key and
            # emit a content stream of "Do" operators for them.
            content = ['Q'] if restore_first else []
            for obj in xobj_list:
                index = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
                if xobjs.setdefault(index, obj) is not obj:
                    raise KeyError("XObj key %s already in use" % index)
                content.append('%s Do' % index)
            return PdfDict(indirect=True, stream='\n'.join(content))

        mbox = self.mbox
        cbox = self.cbox
        page = self.page
        old_contents = self.contents
        resources = self.resources or PdfDict()

        # key_offset makes (key_offset + len(xobjs)) produce numbers
        # that do not collide with preexisting /pdfrw_N keys.
        key_offset = 0
        xobjs = resources.XObject
        if xobjs is None:
            xobjs = resources.XObject = PdfDict()
        else:
            allkeys = xobjs.keys()
            if allkeys:
                keys = (x for x in allkeys if x.startswith('/pdfrw_'))
                keys = (x for x in keys if x[7:].isdigit())
                keys = sorted(keys, key=lambda x: int(x[7:]))
                key_offset = (int(keys[-1][7:]) + 1) if keys else 0
                key_offset -= len(allkeys)

        if old_contents is None:
            new_contents = do_xobjs(self)
        else:
            isdict = isinstance(old_contents, PdfDict)
            old_contents = [old_contents] if isdict else old_contents
            new_contents = PdfArray()
            # The None placeholder marks where the original page
            # contents belong in the draw order.
            index = self.index(None)
            if index:
                new_contents.append(do_xobjs(self[:index]))

            index += 1
            if index < len(self):
                # There are elements to add after the original page contents,
                # so push the graphics state to the stack. Restored below.
                new_contents.append(PdfDict(indirect=True, stream='q'))

            new_contents.extend(old_contents)

            if index < len(self):
                # Restore graphics state and add other elements.
                new_contents.append(do_xobjs(self[index:], restore_first=True))

        if mbox is None:
            # No underlying page: synthesize a media box that covers
            # every object (and at least includes the origin).
            cbox = None
            mbox = self.xobj_box
            mbox[0] = min(0, mbox[0])
            mbox[1] = min(0, mbox[1])

        page = PdfDict(indirect=True) if page is None else page
        page.Type = PdfName.Page
        page.Resources = resources
        page.MediaBox = mbox
        page.CropBox = cbox
        page.Rotate = self.rotate
        page.Contents = new_contents
        return page

    @property
    def xobj_box(self):
        ''' Return the smallest box that encloses every object
            in the list.
        '''
        a, b, c, d = zip(*(xobj.box for xobj in self))
        return PdfArray((min(a), min(b), max(c), max(d)))
|
|
@ -0,0 +1,691 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# Copyright (C) 2012-2015 Nerijus Mika
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
The PdfReader class reads an entire PDF file into memory and
|
||||||
|
parses the top-level container objects. (It does not parse
|
||||||
|
into streams.) The object subclasses PdfDict, and the
|
||||||
|
document pages are stored in a list in the pages attribute
|
||||||
|
of the object.
|
||||||
|
'''
|
||||||
|
import gc
|
||||||
|
import binascii
|
||||||
|
import collections
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
from .errors import PdfParseError, log
|
||||||
|
from .tokens import PdfTokens
|
||||||
|
from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
|
||||||
|
from .uncompress import uncompress
|
||||||
|
from . import crypt
|
||||||
|
from .py23_diffs import convert_load, convert_store, iteritems
|
||||||
|
|
||||||
|
|
||||||
|
class PdfReader(PdfDict):
|
||||||
|
|
||||||
|
def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
    ''' Return a previously loaded indirect object, or create
        a lazy placeholder for it on first reference.
    '''
    key = int(objnum), int(gennum)
    cached = self.indirect_objects.get(key)
    if cached is not None:
        return cached
    placeholder = PdfIndirect(key)
    self.indirect_objects[key] = placeholder
    self.deferred_objects.add(key)
    # The placeholder resolves itself through loadindirect on demand.
    placeholder._loader = self.loadindirect
    return placeholder
|
||||||
|
|
||||||
|
def readarray(self, source, PdfArray=PdfArray):
    ''' Found a [ token.  Parse the tokens after that into a
        PdfArray, resolving "<objnum> <gennum> R" triples into
        indirect references.
    '''
    specialget = self.special.get
    items = []

    for tok in source:
        if tok in ']R':
            if tok == ']':
                break
            # "R": fold the two numbers already collected into a
            # single indirect reference.
            gennum = items.pop()
            tok = self.findindirect(items.pop(), gennum)
        else:
            handler = specialget(tok)
            if handler is not None:
                tok = handler(source)
        items.append(tok)
    return PdfArray(items)
|
||||||
|
|
||||||
|
def readdict(self, source, PdfDict=PdfDict):
    ''' Found a << token. Parse the tokens after that.

        Keys must be /name objects; values may be any object,
        including "<objnum> <gennum> R" indirect references, which
        require two tokens of lookahead to recognize.
    '''
    specialget = self.special.get
    result = PdfDict()
    next = source.next

    tok = next()
    while tok != '>>':
        if not tok.startswith('/'):
            # Recoverable: skip the bad key and keep parsing.
            source.error('Expected PDF /name object')
            tok = next()
            continue
        key = tok
        value = next()
        func = specialget(value)
        if func is not None:
            # Value starts a container ([ or <<) or other special form.
            value = func(source)
            tok = next()
        else:
            tok = next()
            # Two integers might be the start of "N G R".
            if value.isdigit() and tok.isdigit():
                tok2 = next()
                if tok2 != 'R':
                    source.error('Expected "R" following two integers')
                    tok = tok2
                    continue
                value = self.findindirect(value, tok)
                tok = next()
        result[key] = value
    return result
|
||||||
|
|
||||||
|
def empty_obj(self, source, PdfObject=PdfObject):
    ''' Some silly git put an empty object in the file.
        Rewind the token stream so the caller sees the endobj
        keyword itself.
    '''
    source.floc = source.tokstart
|
||||||
|
|
||||||
|
def badtoken(self, source):
    ''' Abort parsing: an unexpected delimiter token was seen. '''
    source.exception('Unexpected delimiter')
|
||||||
|
|
||||||
|
def findstream(self, obj, tok, source, len=len):
    ''' Figure out where a content stream following an object
        begins, and return the start offset of its data.

        (The stream itself cannot be read yet, because its length
        might be given by a /Length that is an indirect object.)

        Per the spec, the "stream" keyword must be followed by
        \r\n or \n; a bare \r is tolerated with a warning.
    '''
    fdata = source.fdata
    start = source.tokstart + len(tok)
    has_cr = fdata[start] == '\r'
    start += has_cr
    has_lf = fdata[start] == '\n'
    start += has_lf
    if not has_lf:
        if has_cr:
            source.warning(r"stream keyword terminated "
                           r"by \r without \n")
        else:
            source.error(r'stream keyword not followed by \n')
    return start
|
||||||
|
|
||||||
|
def readstream(self, obj, startstream, source, exact_required=False,
               streamending='endstream endobj'.split(), int=int):
    ''' Read the stream data for obj, trusting /Length first and
        falling back to searching for the endstream keyword when
        the declared length is wrong.

        With exact_required, any mismatch is a fatal parse error
        (used for xref streams, which must be well-formed).
    '''
    fdata = source.fdata
    length = int(obj.Length)
    source.floc = target_endstream = startstream + length
    # If /Length is right, the next two tokens are endstream endobj.
    endit = source.multiple(2)
    obj._stream = fdata[startstream:target_endstream]
    if endit == streamending:
        return

    if exact_required:
        source.exception('Expected endstream endobj')

    # The length attribute does not match the distance between the
    # stream and endstream keywords.

    # TODO: Extract maxstream from dictionary of object offsets
    # and use rfind instead of find.
    maxstream = len(fdata) - 20
    endstream = fdata.find('endstream', startstream, maxstream)
    source.floc = startstream
    room = endstream - startstream
    if endstream < 0:
        source.error('Could not find endstream')
        return
    # Off-by-one case: the writer counted the \r of a \r\n pair as
    # part of the stream data.
    if (length == room + 1 and
            fdata[startstream - 2:startstream] == '\r\n'):
        source.warning(r"stream keyword terminated by \r without \n")
        obj._stream = fdata[startstream - 1:target_endstream - 1]
        return
    source.floc = endstream
    if length > room:
        source.error('stream /Length attribute (%d) appears to '
                     'be too big (size %d) -- adjusting',
                     length, room)
        obj.stream = fdata[startstream:endstream]
        return
    # Declared length too small only matters if real (non-blank)
    # data sits between the declared end and the endstream keyword.
    if fdata[target_endstream:endstream].rstrip():
        source.error('stream /Length attribute (%d) appears to '
                     'be too small (size %d) -- adjusting',
                     length, room)
        obj.stream = fdata[startstream:endstream]
        return
    endobj = fdata.find('endobj', endstream, maxstream)
    if endobj < 0:
        source.error('Could not find endobj after endstream')
        return
    if fdata[endstream:endobj].rstrip() != 'endstream':
        source.error('Unexpected data between endstream and endobj')
        return
    source.error('Illegal endstream/endobj combination')
|
||||||
|
|
||||||
|
def loadindirect(self, key, PdfDict=PdfDict,
                 isinstance=isinstance):
    ''' Load the indirect object for key = (objnum, gennum) from the
        file, replacing any PdfIndirect placeholder in the cache.
        Returns the loaded object, or None if it cannot be found.
    '''
    result = self.indirect_objects.get(key)
    # Anything other than a placeholder has already been loaded.
    if not isinstance(result, PdfIndirect):
        return result
    source = self.source
    offset = int(self.source.obj_offsets.get(key, '0'))
    if not offset:
        source.warning("Did not find PDF object %s", key)
        return None

    # Read the object header and validate it
    objnum, gennum = key
    source.floc = offset
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit() and int(objid[0]) == objnum
    ok = ok and objid[1].isdigit() and int(objid[1]) == gennum
    ok = ok and objid[2] == 'obj'
    if not ok:
        # The xref offset was wrong; search the file for the literal
        # "N G obj" header instead.  Only trust the result if it is
        # unambiguous (a second match means we cannot pick one).
        source.floc = offset
        source.next()
        objheader = '%d %d obj' % (objnum, gennum)
        fdata = source.fdata
        offset2 = (fdata.find('\n' + objheader) + 1 or
                   fdata.find('\r' + objheader) + 1)
        if (not offset2 or
                fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
            source.warning("Expected indirect object '%s'", objheader)
            return None
        source.warning("Indirect object %s found at incorrect "
                       "offset %d (expected offset %d)",
                       objheader, offset2, offset)
        source.floc = offset2 + len(objheader)

    # Read the object, and call special code if it starts
    # an array or dictionary
    obj = source.next()
    func = self.special.get(obj)
    if func is not None:
        obj = func(source)

    self.indirect_objects[key] = obj
    self.deferred_objects.remove(key)

    # Mark the object as indirect, and
    # just return it if it is a simple object.
    obj.indirect = key
    tok = source.next()
    if tok == 'endobj':
        return obj

    # Should be a stream.  Either that or it's broken.
    isdict = isinstance(obj, PdfDict)
    if isdict and tok == 'stream':
        self.readstream(obj, self.findstream(obj, tok, source), source)
        return obj

    # Houston, we have a problem, but let's see if it
    # is easily fixable.  Leaving out a space before endobj
    # is apparently an easy mistake to make on generation
    # (Because it won't be noticed unless you are specifically
    # generating an indirect object that doesn't end with any
    # sort of delimiter.)  It is so common that things like
    # okular just handle it.

    if isinstance(obj, PdfObject) and obj.endswith('endobj'):
        source.error('No space or delimiter before endobj')
        obj = PdfObject(obj[:-6])
    else:
        source.error("Expected 'endobj'%s token",
                     isdict and " or 'stream'" or '')
        obj = PdfObject('')

    obj.indirect = key
    self.indirect_objects[key] = obj
    return obj
|
||||||
|
|
||||||
|
def read_all(self):
    ''' Load every indirect object that has been referenced but
        not yet parsed.  Loading one object may reference more,
        so iterate until no new deferred keys appear.
    '''
    deferred = self.deferred_objects
    seen = set()
    while True:
        pending = deferred - seen
        if not pending:
            break
        seen |= deferred
        for key in pending:
            self.loadindirect(key)
|
||||||
|
|
||||||
|
def decrypt_all(self):
    ''' Load everything, then decrypt all loaded indirect objects
        (a no-op when the file is not encrypted).
    '''
    self.read_all()

    if self.crypt_filters is None:
        return
    crypt.decrypt_objects(
        self.indirect_objects.values(), self.stream_crypt_filter,
        self.crypt_filters)
|
||||||
|
|
||||||
|
def uncompress(self):
    ''' Load everything, then decompress the streams of all loaded
        indirect objects (delegates to the module-level uncompress,
        which this method name shadows).
    '''
    self.read_all()

    uncompress(self.indirect_objects.values())
|
||||||
|
|
||||||
|
def load_stream_objects(self, object_streams):
    ''' Load compressed objects stored inside /ObjStm object
        streams.  object_streams maps stream object numbers to the
        (objnum, index) pairs they contain.
    '''
    # read object streams
    objs = []
    for num in object_streams:
        obj = self.findindirect(num, 0).real_value()
        assert obj.Type == '/ObjStm'
        objs.append(obj)

    # read objects from stream
    if objs:
        # Decrypt
        if self.crypt_filters is not None:
            crypt.decrypt_objects(
                objs, self.stream_crypt_filter, self.crypt_filters)

        # Decompress
        uncompress(objs)

        for obj in objs:
            # The stream begins with "objnum offset" pairs up to
            # /First, where the serialized objects start.
            objsource = PdfTokens(obj.stream, 0, False)
            next = objsource.next
            offsets = []
            firstoffset = int(obj.First)
            while objsource.floc < firstoffset:
                offsets.append((int(next()), firstoffset + int(next())))
            for num, offset in offsets:
                # Read the object, and call special code if it starts
                # an array or dictionary
                objsource.floc = offset
                sobj = next()
                func = self.special.get(sobj)
                if func is not None:
                    sobj = func(objsource)

                # Objects inside streams always have generation 0.
                key = (num, 0)
                self.indirect_objects[key] = sobj
                if key in self.deferred_objects:
                    self.deferred_objects.remove(key)

                # Mark the object as indirect, and
                # add it to the list of streams if it starts a stream
                sobj.indirect = key
|
||||||
|
|
||||||
|
def findxref(self, fdata):
    ''' Find the cross reference section at the end of a file.

        Returns the offset of the final "startxref" marker along
        with a tokenizer positioned at the table it points to.
    '''
    marker = fdata.rfind('startxref')
    if marker < 0:
        raise PdfParseError('Did not find "startxref" at end of file')
    source = PdfTokens(fdata, marker, False, self.verbose)
    tok = source.next()
    assert tok == 'startxref'  # (We just checked this...)
    tableloc = source.next_default()
    if not tableloc.isdigit():
        source.exception('Expected table location')
    # The trailer must end with %%EOF (leading %'s stripped here).
    if source.next_default().rstrip().lstrip('%') != 'EOF':
        source.exception('Expected %%EOF')
    return marker, PdfTokens(fdata, int(tableloc), True, self.verbose)
|
||||||
|
|
||||||
|
def parse_xref_stream(self, source, int=int, range=range,
                      enumerate=enumerate, islice=itertools.islice,
                      defaultdict=collections.defaultdict,
                      hexlify=binascii.hexlify):
    ''' Parse (one of) the cross-reference file section(s)

        This variant handles PDF 1.5+ cross-reference *streams*:
        a /XRef stream object whose data is rows of fixed-width
        binary fields described by /W, covering the object ranges
        listed in /Index.
    '''

    def readint(s, lengths):
        # Generator yielding one fixed-width big-endian integer per
        # field; a zero-width field yields None (meaning "default").
        offset = 0
        for length in itertools.cycle(lengths):
            next = offset + length
            yield int(hexlify(s[offset:next]), 16) if length else None
            offset = next

    setdefault = source.obj_offsets.setdefault
    next = source.next
    # check for xref stream object
    objid = source.multiple(3)
    ok = len(objid) == 3
    ok = ok and objid[0].isdigit()
    ok = ok and objid[1] == 'obj'
    ok = ok and objid[2] == '<<'
    if not ok:
        source.exception('Expected xref stream start')
    obj = self.readdict(source)
    if obj.Type != PdfName.XRef:
        source.exception('Expected dict type of /XRef')
    tok = next()
    # exact_required=True: an xref stream must be well-formed.
    self.readstream(obj, self.findstream(obj, tok, source), source, True)
    old_strm = obj.stream
    if not uncompress([obj], True):
        source.exception('Could not decompress Xref stream')
    stream = obj.stream
    # Fix for issue #76 -- goofy compressed xref stream
    # that is NOT ACTUALLY COMPRESSED
    stream = stream if stream is not old_strm else convert_store(old_strm)
    # /Index lists (first objnum, count) pairs; default is one run
    # starting at object 0.
    num_pairs = obj.Index or PdfArray(['0', obj.Size])
    num_pairs = [int(x) for x in num_pairs]
    num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
    entry_sizes = [int(x) for x in obj.W]
    if len(entry_sizes) != 3:
        source.exception('Invalid entry size')
    object_streams = defaultdict(list)
    get = readint(stream, entry_sizes)
    for objnum, size in num_pairs:
        for cnt in range(size):
            # Row = (type, field1, field2); type defaults to 1.
            xtype, p1, p2 = islice(get, 3)
            if xtype in (1, None):
                # Type 1: p1 = byte offset, p2 = generation.
                if p1:
                    setdefault((objnum, p2 or 0), p1)
            elif xtype == 2:
                # Type 2: object lives in object stream p1 at index p2.
                object_streams[p1].append((objnum, p2))
            objnum += 1

    obj.private.object_streams = object_streams
    return obj
|
||||||
|
|
||||||
|
def parse_xref_table(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        Reads a classic "xref" table from the tokenizer and records
        every in-use object's file offset into source.obj_offsets.
        Stops (successfully) when the 'trailer' keyword is reached.

        If the table is malformed, falls back to a line-by-line
        recovery scan between the table start and the last 'trailer'
        keyword in the file data.

        int and range are bound as defaults purely as a CPython
        lookup-speed micro-optimization.
    '''
    setdefault = source.obj_offsets.setdefault
    next = source.next
    # plain xref table
    start = source.floc
    try:
        while 1:
            tok = next()
            if tok == 'trailer':
                return
            # Each subsection header is "<first objnum> <count>".
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(next())):
                offset = int(next())
                generation = int(next())
                inuse = next()
                if inuse == 'n':
                    # Offset 0 entries are bogus; skip them.
                    # setdefault keeps the FIRST (most recent xref
                    # section's) offset for each (objnum, generation).
                    if offset != 0:
                        setdefault((objnum, generation), offset)
                elif inuse != 'f':
                    raise ValueError
    # Deliberate broad catch: any parse failure above drops us into
    # the recovery path below rather than aborting the whole read.
    except:
        pass
    try:
        # Table formatted incorrectly.
        # See if we can figure it out anyway.
        end = source.fdata.rindex('trailer', start)
        table = source.fdata[start:end].splitlines()
        for line in table:
            tokens = line.split()
            if len(tokens) == 2:
                # Subsection header line: remember the starting objnum.
                objnum = int(tokens[0])
            elif len(tokens) == 3:
                # Entry line: "<offset> <generation> <n|f>".
                offset, generation, inuse = (int(tokens[0]),
                                             int(tokens[1]), tokens[2])
                if offset != 0 and inuse == 'n':
                    setdefault((objnum, generation), offset)
                objnum += 1
            elif tokens:
                log.error('Invalid line in xref table: %s' %
                          repr(line))
                raise ValueError
        log.warning('Badly formatted xref table')
        # Resume tokenizing just before 'trailer' and consume it.
        source.floc = end
        next()
    except:
        # Recovery failed too -- restore position and report.
        source.floc = start
        source.exception('Invalid table format')
|
||||||
|
|
||||||
|
def parsexref(self, source):
    ''' Parse (one of) the cross-reference file section(s)

        Dispatches on the first token: a digit introduces an
        "<objnum> <gen> obj" xref *stream*; the 'xref' keyword
        introduces a classic table followed by a trailer dict.

        Returns (trailer_dict, is_stream).
    '''
    fetch = source.next
    try:
        token = fetch()
    except StopIteration:
        token = ''
    if token.isdigit():
        # Cross-reference stream object.
        return self.parse_xref_stream(source), True
    if token == 'xref':
        # Classic cross-reference table, then the trailer dictionary.
        self.parse_xref_table(source)
        if fetch() != '<<':
            source.exception('Expected "<<" starting catalog')
        return self.readdict(source), False
    source.exception('Expected "xref" keyword or xref stream object')
|
||||||
|
|
||||||
|
def readpages(self, node):
    ''' Flatten the page tree rooted at node into an ordered list of
        /Page dictionaries.  Accepts a /Catalog, /Pages, or /Page node.
        Returns [] (after logging) on a structurally invalid tree.
    '''
    page_t = PdfName.Page
    pages_t = PdfName.Pages
    catalog_t = PdfName.Catalog
    type_key = PdfName.Type
    kids_key = PdfName.Kids

    found = []
    pending = [node]
    try:
        while pending:
            current = pending.pop()
            kind = current[type_key]
            if kind == page_t:
                found.append(current)
            elif kind == pages_t:
                # Push kids reversed so popping yields document order.
                pending.extend(reversed(current[kids_key]))
            elif kind == catalog_t:
                pending.append(current[pages_t])
            else:
                log.error('Expected /Page or /Pages dictionary, got %s' %
                          repr(current))
        return found
    except (AttributeError, TypeError) as s:
        log.error('Invalid page tree: %s' % s)
        return []
|
||||||
|
|
||||||
|
def _parse_encrypt_info(self, source, password, trailer):
    """Check password and initialize crypt filters.

    Derives the document key from the password and the trailer's
    /Encrypt dictionary, verifies the user password (warning only
    on mismatch), and populates self.crypt_filters plus the default
    stream/string filters on self.private.

    Only /Encrypt versions 1, 2 (RC4) and 4 (crypt filter dicts,
    AESV2 or V2 entries) are handled; anything else just warns.
    """
    # Create and check password key
    key = crypt.create_key(password, trailer)

    if not crypt.check_user_password(key, trailer):
        # Non-fatal: decryption is still attempted with the bad key.
        source.warning('User password does not validate')

    # Create default crypt filters
    private = self.private
    crypt_filters = self.crypt_filters
    version = int(trailer.Encrypt.V or 0)
    if version in (1, 2):
        # Whole document uses a single RC4 filter for both
        # streams and strings.
        crypt_filter = crypt.RC4CryptFilter(key)
        private.stream_crypt_filter = crypt_filter
        private.string_crypt_filter = crypt_filter
    elif version == 4:
        # Version 4 defines named filters in /CF, then selects
        # defaults by name via /StmF and /StrF.
        if PdfName.CF in trailer.Encrypt:
            for name, params in iteritems(trailer.Encrypt.CF):
                if name == PdfName.Identity:
                    continue

                cfm = params.CFM
                if cfm == PdfName.AESV2:
                    crypt_filters[name] = crypt.AESCryptFilter(key)
                elif cfm == PdfName.V2:
                    crypt_filters[name] = crypt.RC4CryptFilter(key)
                else:
                    source.warning(
                        'Unsupported crypt filter: {}, {}'.format(
                            name, cfm))

        # Read default stream filter
        if PdfName.StmF in trailer.Encrypt:
            name = trailer.Encrypt.StmF
            if name in crypt_filters:
                private.stream_crypt_filter = crypt_filters[name]
            else:
                source.warning(
                    'Invalid crypt filter name in /StmF:'
                    ' {}'.format(name))

        # Read default string filter
        if PdfName.StrF in trailer.Encrypt:
            name = trailer.Encrypt.StrF
            if name in crypt_filters:
                private.string_crypt_filter = crypt_filters[name]
            else:
                source.warning(
                    'Invalid crypt filter name in /StrF:'
                    ' {}'.format(name))
    else:
        source.warning(
            'Unsupported Encrypt version: {}'.format(version))
|
||||||
|
|
||||||
|
def __init__(self, fname=None, fdata=None, decompress=False,
             decrypt=False, password='', disable_gc=True, verbose=True):
    ''' Read and parse an entire PDF document.

        fname      -- file name or file-like object with .read()
        fdata      -- alternatively, the raw PDF data (exactly one of
                      fname/fdata must be given)
        decompress -- uncompress all streams after loading
        decrypt    -- decrypt the document (requires crypto support)
        password   -- password used when decrypt is True
        disable_gc -- temporarily switch off the garbage collector
                      while parsing (significant speedup)
        verbose    -- emit repeated warnings (False dedups them)

        Raises PdfParseError on unreadable files, missing/invalid
        PDF header, or missing %EOF marker.
    '''
    self.private.verbose = verbose

    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    if disable_gc:
        gc.disable()

    try:
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    f = open(fname, 'rb')
                    fdata = f.read()
                    f.close()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)

        assert fdata is not None
        # Normalize bytes/str differences between Python 2 and 3.
        fdata = convert_load(fdata)

        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # Version digits from the "%PDF-x.y" header.
        self.private.version = fdata[5:8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        # Keep data through "%%EOF"; warn about trailing garbage.
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token dispatch table for the object parser; everything
        # else delimiter-like is a bad token in object position.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source

        # Find all the xref tables/streams, and
        # then deal with them backwards.
        xref_list = []
        while 1:
            source.obj_offsets = {}
            trailer, is_stream = self.parsexref(source)
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref' and not xref_list:
                    source.warning('Expected "startxref" '
                                   'at end of xref table')
                break
            xref_list.append((source.obj_offsets, trailer, is_stream))
            source.floc = int(prev)

        # Handle document encryption
        private.crypt_filters = None
        if decrypt and PdfName.Encrypt in trailer:
            identity_filter = crypt.IdentityCryptFilter()
            crypt_filters = {
                PdfName.Identity: identity_filter
            }
            private.crypt_filters = crypt_filters
            private.stream_crypt_filter = identity_filter
            private.string_crypt_filter = identity_filter

            if not crypt.HAS_CRYPTO:
                raise PdfParseError(
                    'Install PyCrypto to enable encryption support')

            self._parse_encrypt_info(source, password, trailer)

        if is_stream:
            self.load_stream_objects(trailer.object_streams)

        # Replay the newer xref sections on top of the oldest one.
        while xref_list:
            later_offsets, later_trailer, is_stream = xref_list.pop()
            source.obj_offsets.update(later_offsets)
            if is_stream:
                trailer.update(later_trailer)
                self.load_stream_objects(later_trailer.object_streams)
            else:
                trailer = later_trailer

        trailer.Prev = None

        if (trailer.Version and
                float(trailer.Version) > float(self.version)):
            self.private.version = trailer.Version

        if decrypt:
            self.decrypt_all()
            trailer.Encrypt = None

        # An xref-stream trailer carries stream bookkeeping keys we
        # don't want on self, so copy only the document-level ones.
        if is_stream:
            self.Root = trailer.Root
            self.Info = trailer.Info
            self.ID = trailer.ID
            self.Size = trailer.Size
            self.Encrypt = trailer.Encrypt
        else:
            self.update(trailer)

        # self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        private.numPages = len(self.pages)
    finally:
        if disable_gc:
            gc.enable()
|
||||||
|
|
||||||
|
# For compatibility with pyPdf
def getPage(self, pagenum):
    ''' pyPdf-compatible accessor: return the page at index pagenum. '''
    pages = self.pages
    return pages[pagenum]
|
|
@ -0,0 +1,385 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
The PdfWriter class writes an entire PDF file out to disk.
|
||||||
|
|
||||||
|
The writing process is not at all optimized or organized.
|
||||||
|
|
||||||
|
An instance of the PdfWriter class has two methods:
|
||||||
|
addpage(page)
|
||||||
|
and
|
||||||
|
write(fname)
|
||||||
|
|
||||||
|
addpage() assumes that the pages are part of a valid
|
||||||
|
tree/forest of PDF objects.
|
||||||
|
'''
|
||||||
|
import gc
|
||||||
|
|
||||||
|
from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict,
|
||||||
|
PdfObject, PdfString)
|
||||||
|
from .compress import compress as do_compress
|
||||||
|
from .errors import PdfOutputError, log
|
||||||
|
from .py23_diffs import iteritems, convert_store
|
||||||
|
|
||||||
|
# Shared singleton written wherever a killed/dead reference must still
# resolve to something: the PDF 'null' object, marked indirect so the
# writer emits it once and references it.
NullObject = PdfObject('null')
NullObject.indirect = True
NullObject.Type = 'Null object'
|
||||||
|
|
||||||
|
|
||||||
|
def user_fmt(obj, isinstance=isinstance, float=float, str=str,
             basestring=(type(u''), type(b'')), encode=PdfString.encode):
    ''' Default formatter for non-PDF-native Python values.

        This function may be replaced by the user for specialized
        formatting requirements.
    '''
    # Text and byte strings become PDF string literals.
    if isinstance(obj, basestring):
        return encode(obj)
    if isinstance(obj, float):
        # PDFs don't handle exponent notation, so render fixed-point
        # and strip trailing zeros (and a bare trailing decimal point).
        text = '%.9f' % obj
        return text.rstrip('0').rstrip('.')
    return str(obj)
|
||||||
|
|
||||||
|
|
||||||
|
def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                  user_fmt=user_fmt, do_compress=do_compress,
                  convert_store=convert_store, iteritems=iteritems,
                  id=id, isinstance=isinstance, getattr=getattr, len=len,
                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
    ''' FormatObjects performs the actual formatting and disk write.
        Should be a class, was a class, turned into nested functions
        for performance (to reduce attribute lookups).

        f        -- binary file-like object with a write method
        trailer  -- the trailer PdfDict to serialize (with everything
                    reachable from it)
        killobj  -- mapping of id(obj) -> (obj, replacement) for objects
                    that must not be emitted as-is (old catalog/pages)

        The remaining keyword defaults exist only to turn global and
        builtin lookups into fast local lookups.
    '''

    def f_write(s):
        # Single choke-point for encoding str -> bytes on Python 3.
        f.write(convert_store(s))

    def add(obj):
        ''' Add an object to our list, if it's an indirect
            object.  Just format it if not.
        '''
        # Can't hash dicts, so just hash the object ID
        objid = id(obj)

        # Automatically set stream objects to indirect
        if isinstance(obj, PdfDict):
            indirect = obj.indirect or (obj.stream is not None)
        else:
            indirect = getattr(obj, 'indirect', False)

        if not indirect:
            # 'visited' detects a direct object reachable twice on the
            # current formatting path; copy it to avoid cycles.
            if objid in visited:
                log.warning('Replicating direct %s object, '
                            'should be indirect for optimal file size' %
                            type(obj))
                obj = type(obj)(obj)
                objid = id(obj)
            visiting(objid)
            result = format_obj(obj)
            leaving(objid)
            return result

        objnum = indirect_dict_get(objid)

        # If we haven't seen the object yet, we need to
        # add it to the indirect object list.
        if objnum is None:
            swapped = swapobj(objid)
            if swapped is not None:
                # Object is in killobj -- emit its replacement instead,
                # and remember the mapping for the original id too.
                old_id = objid
                obj = swapped
                objid = id(obj)
                objnum = indirect_dict_get(objid)
                if objnum is not None:
                    indirect_dict[old_id] = objnum
                    return '%s 0 R' % objnum
            # Reserve a slot now; format the body later (deferred) to
            # avoid deep recursion.
            objnum = len(objlist) + 1
            objlist_append(None)
            indirect_dict[objid] = objnum
            deferred.append((objnum - 1, obj))
        return '%s 0 R' % objnum

    def format_array(myarray, formatter):
        # Format array data into semi-readable ASCII
        if sum([len(x) for x in myarray]) <= 70:
            return formatter % space_join(myarray)
        return format_big(myarray, formatter)

    def format_big(myarray, formatter):
        # Wrap the pre-formatted items into lines of <= ~71 chars.
        bigarray = []
        count = 1000000  # force a new subarray on the first item
        for x in myarray:
            lenx = len(x) + 1
            count += lenx
            if count > 71:
                subarray = []
                bigarray.append(subarray)
                count = lenx
            subarray.append(x)
        return formatter % lf_join([space_join(x) for x in bigarray])

    def format_obj(obj):
        ''' format PDF object data into semi-readable ASCII.
            May mutually recurse with add() -- add() will
            return references for indirect objects, and add
            the indirect object to the list.
        '''
        while 1:
            if isinstance(obj, (list, dict, tuple)):
                if isinstance(obj, PdfArray):
                    myarray = [add(x) for x in obj]
                    return format_array(myarray, '[%s]')
                elif isinstance(obj, PdfDict):
                    if compress and obj.stream:
                        do_compress([obj])
                    # Sort by encoded key name for deterministic output.
                    pairs = sorted((getattr(x, 'encoded', None) or x, y)
                                   for (x, y) in obj.iteritems())
                    myarray = []
                    for key, value in pairs:
                        myarray.append(key)
                        myarray.append(add(value))
                    result = format_array(myarray, '<<%s>>')
                    stream = obj.stream
                    if stream is not None:
                        result = ('%s\nstream\n%s\nendstream' %
                                  (result, stream))
                    return result
                # Plain list/dict/tuple: wrap in the matching Pdf type
                # and loop to format it.
                obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                continue

            # We assume that an object with an indirect
            # attribute knows how to represent itself to us.
            if hasattr(obj, 'indirect'):
                return str(getattr(obj, 'encoded', None) or obj)
            return user_fmt(obj)

    def format_deferred():
        while deferred:
            index, obj = deferred.pop()
            objlist[index] = format_obj(obj)

    indirect_dict = {}
    indirect_dict_get = indirect_dict.get
    objlist = []
    objlist_append = objlist.append
    visited = set()
    visiting = visited.add
    leaving = visited.remove
    space_join = ' '.join
    lf_join = '\n '.join

    deferred = []

    # Don't reference old catalog or pages objects --
    # swap references to new ones.
    type_remap = {PdfName.Catalog: trailer.Root,
                  PdfName.Pages: trailer.Root.Pages, None: trailer}.get
    swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
               for objid, (obj, new_obj) in iteritems(killobj)]
    swapobj = dict((objid, obj is None and NullObject or obj)
                   for objid, obj in swapobj).get

    for objid in killobj:
        assert swapobj(objid) is not None

    # The first format of trailer gets all the information,
    # but we throw away the actual trailer formatting.
    format_obj(trailer)
    # Keep formatting until we're done.
    # (Used to recurse inside format_obj for this, but
    # hit system limit.)
    format_deferred()
    # Now we know the size, so we update the trailer dict
    # and get the formatted data.
    trailer.Size = PdfObject(len(objlist) + 1)
    trailer = format_obj(trailer)

    # Now we have all the pieces to write out to the file.
    # Keep careful track of the counts while we do it so
    # we can correctly build the cross-reference.

    header = '%%PDF-%s\n%%\xe2\xe3\xcf\xd3\n' % version
    f_write(header)
    offset = len(header)
    offsets = [(0, 65535, 'f')]
    offsets_append = offsets.append

    for i, x in enumerate(objlist):
        objstr = '%s 0 obj\n%s\nendobj\n' % (i + 1, x)
        offsets_append((offset, 0, 'n'))
        offset += len(objstr)
        f_write(objstr)

    f_write('xref\n0 %s\n' % len(offsets))
    for x in offsets:
        f_write('%010d %05d %s\r\n' % x)
    f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
|
||||||
|
|
||||||
|
|
||||||
|
class PdfWriter(object):
    ''' Writes a PDF document assembled from added pages.

        Typical use:  PdfWriter(fname).addpages(pages).write()
    '''

    # Cached trailer dict; invalidated whenever a page is added.
    _trailer = None
    # When True, make_canonical() runs before the trailer is built.
    canonicalize = False
    fname = None

    def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
        """
        Parameters:
            fname -- Output file name, or file-like binary object
                     with a write method
            version -- PDF version to target.  Currently only 1.3
                       supported.
            compress -- True to do compression on output.  Currently
                        compresses stream objects.
        """

        # Legacy support:  fname is new, was added in front
        # (older callers passed (version, compress) positionally;
        # a float()-able first argument means the old calling style).
        if fname is not None:
            try:
                float(fname)
            except (ValueError, TypeError):
                pass
            else:
                if version != '1.3':
                    assert compress == False
                    compress = version
                version = fname
                fname = None

        self.fname = fname
        self.version = version
        self.compress = compress

        if kwargs:
            # Only class attributes captured in 'replaceable' (below)
            # may be overridden by keyword.
            for name, value in iteritems(kwargs):
                if name not in self.replaceable:
                    raise ValueError("Cannot set attribute %s "
                                     "on PdfWriter instance" % name)
                setattr(self, name, value)

        self.pagearray = PdfArray()
        self.killobj = {}

    def addpage(self, page):
        ''' Append one /Page dict (copied, with inheritable attributes
            resolved) to the output.  Returns self for chaining.
        '''
        self._trailer = None
        if page.Type != PdfName.Page:
            raise PdfOutputError('Bad /Type: Expected %s, found %s'
                                 % (PdfName.Page, page.Type))
        inheritable = page.inheritable  # searches for resources
        self.pagearray.append(
            IndirectPdfDict(
                page,
                Resources=inheritable.Resources,
                MediaBox=inheritable.MediaBox,
                CropBox=inheritable.CropBox,
                Rotate=inheritable.Rotate,
            )
        )

        # Add parents in the hierarchy to objects we
        # don't want to output
        killobj = self.killobj
        obj, new_obj = page, self.pagearray[-1]
        while obj is not None:
            objid = id(obj)
            if objid in killobj:
                break
            killobj[objid] = obj, new_obj
            obj = obj.Parent
            new_obj = None
        return self

    addPage = addpage  # for compatibility with pyPdf

    def addpages(self, pagelist):
        ''' Append every page in pagelist; returns self for chaining. '''
        for page in pagelist:
            self.addpage(page)
        return self

    def _get_trailer(self):
        # Lazily (re)build the trailer from the accumulated pages.
        trailer = self._trailer
        if trailer is not None:
            return trailer

        if self.canonicalize:
            self.make_canonical()

        # Create the basic object structure of the PDF file
        trailer = PdfDict(
            Root=IndirectPdfDict(
                Type=PdfName.Catalog,
                Pages=IndirectPdfDict(
                    Type=PdfName.Pages,
                    Count=PdfObject(len(self.pagearray)),
                    Kids=self.pagearray
                )
            )
        )
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
            page.indirect = True
        self._trailer = trailer
        return trailer

    def _set_trailer(self, trailer):
        self._trailer = trailer

    trailer = property(_get_trailer, _set_trailer)

    def write(self, fname=None, trailer=None, user_fmt=user_fmt,
              disable_gc=True):
        ''' Serialize everything to fname (or the fname given to the
            constructor -- exactly one of the two must be set).
        '''

        trailer = trailer or self.trailer

        # Support fname for legacy applications
        if (fname is not None) == (self.fname is not None):
            raise PdfOutputError(
                "PdfWriter fname must be specified exactly once")

        fname = fname or self.fname

        # Dump the data.  We either have a filename or a preexisting
        # file object.
        preexisting = hasattr(fname, 'write')
        f = preexisting and fname or open(fname, 'wb')
        if disable_gc:
            gc.disable()

        try:
            FormatObjects(f, trailer, self.version, self.compress,
                          self.killobj, user_fmt=user_fmt)
        finally:
            # Only close files we opened ourselves.
            if not preexisting:
                f.close()
            if disable_gc:
                gc.enable()

    def make_canonical(self):
        ''' Canonicalizes a PDF.  Assumes everything
            is a Pdf object already.

            Marks every container reachable from the page array as
            indirect and every leaf as direct.
        '''
        visited = set()
        workitems = list(self.pagearray)
        while workitems:
            obj = workitems.pop()
            objid = id(obj)
            if objid in visited:
                continue
            visited.add(objid)
            obj.indirect = False
            if isinstance(obj, (PdfArray, PdfDict)):
                obj.indirect = True
                if isinstance(obj, PdfArray):
                    workitems += obj
                else:
                    workitems += obj.values()

    # Snapshot of the class namespace: the attribute names that may be
    # overridden via keyword arguments to __init__.
    replaceable = set(vars())
|
|
@ -0,0 +1,53 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

# Deal with Python2/3 differences

# zlib is optional; compression support degrades gracefully without it.
try:
    import zlib
except ImportError:
    zlib = None

# The NameError branch runs on Python 3 (no 'unicode' builtin), where
# file data is bytes and must be mapped 1:1 to str via Latin-1; on
# Python 2 the str type already holds raw bytes, so the converters
# are identity functions.
try:
    unicode = unicode
except NameError:

    def convert_load(s):
        # bytes from disk -> str, one code point per byte
        if isinstance(s, bytes):
            return s.decode('Latin-1')
        return s

    def convert_store(s):
        # str -> bytes for writing, inverse of convert_load
        return s.encode('Latin-1')

    def from_array(a):
        return a.tobytes()

else:

    def convert_load(s):
        return s

    def convert_store(s):
        return s

    def from_array(a):
        return a.tostring()

# Name of the iterator-advance method: 'next' on py2, '__next__' on py3.
nextattr, = (x for x in dir(iter([])) if 'next' in x)

# Unbound dict iteration method usable on both major versions.
try:
    iteritems = dict.iteritems
except AttributeError:
    iteritems = dict.items

try:
    xrange = xrange
except NameError:
    xrange = range

# intern moved to the sys module in Python 3.
try:
    intern = intern
except NameError:
    from sys import intern
|
|
@ -0,0 +1,229 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
A tokenizer for PDF streams.
|
||||||
|
|
||||||
|
In general, documentation used was "PDF reference",
|
||||||
|
sixth edition, for PDF version 1.7, dated November 2006.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
import itertools
|
||||||
|
from .objects import PdfString, PdfObject
|
||||||
|
from .objects.pdfname import BasePdfName
|
||||||
|
from .errors import log, PdfParseError
|
||||||
|
from .py23_diffs import nextattr, intern
|
||||||
|
|
||||||
|
|
||||||
|
def linepos(fdata, loc):
    ''' Map a character offset into fdata to a 1-based (line, column)
        pair for error reporting.

        Handles \n, \r, and \r\n line endings, taking care not to
        count a \r\n pair as two lines.
    '''
    newlines = fdata.count('\n', 0, loc)
    returns = fdata.count('\r', 0, loc)
    crlf_pairs = fdata.count('\r\n', 0, loc)
    line = 1 + newlines + returns - crlf_pairs
    # rfind yields -1 when no EOL precedes loc, which makes the
    # column 1-based on the first line as well.
    last_eol = max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
    return line, loc - last_eol
|
||||||
|
|
||||||
|
|
||||||
|
class PdfTokens(object):
    ''' Iterator over the tokens of a PDF file/stream.

        The regular expressions below implement the lexical rules of
        the PDF specification (whitespace, delimiters, names, strings,
        comments); they are combined into two compiled matchers,
        findtok (all tokens) and findparen (literal-string pieces).
    '''

    # Table 3.1, page 50 of reference, defines whitespace
    eol = '\n\r'
    whitespace = '\x00 \t\f' + eol

    # Text on page 50 defines delimiter characters
    # Escape the ]
    delimiters = r'()<>{}[\]/%'

    # "normal" stuff is all but delimiters or whitespace.

    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
                                             whitespace)

    p_comment = r'\%%[^%s]*' % eol

    # This will get the bulk of literal strings.
    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'

    # This will get more pieces of literal strings
    # (Don't ask me why, but it hangs without the trailing ?.)
    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'

    # A hex string.  This one's easy.
    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace

    p_dictdelim = r'\<\<|\>\>'
    p_name = r'/[^%s%s]*' % (delimiters, whitespace)

    # Last-resort single character match so the tokenizer always
    # makes progress.
    p_catchall = '[^%s]' % whitespace

    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
                        p_literal_string, p_comment, p_catchall])
    # Each match consumes the token plus any trailing whitespace.
    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
                         re.DOTALL).finditer
    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                          whitespace), re.DOTALL).finditer
|
||||||
|
|
||||||
|
def _gettoks(self, startloc, intern=intern,
             delimiters=delimiters, findtok=findtok,
             findparen=findparen, PdfString=PdfString,
             PdfObject=PdfObject, BasePdfName=BasePdfName):
    ''' Given a source data string and a location inside it,
        gettoks generates tokens.  Each token is a tuple of the form:
         <starting file loc>,  <ending file loc>, <token string>
        The ending file loc is past any trailing whitespace.

        The main complication here is the literal strings, which
        can contain nested parentheses.  In order to cope with these
        we can discard the current iterator and loop back to the
        top to get a fresh one.

        We could use re.search instead of re.finditer, but that's slower.
    '''
    fdata = self.fdata
    # current[0] is shared with the instance so that external
    # repositioning (setstart/floc) is visible inside the loop.
    current = self.current = [(startloc, startloc)]
    # Token-string -> constructed-token cache (tokens repeat a lot).
    cache = {}
    get_cache = cache.get
    while 1:
        for match in findtok(fdata, current[0][1]):
            current[0] = tokspan = match.span()
            token = match.group(1)
            firstch = token[0]
            # Default token constructor; overridden per token class.
            toktype = intern
            if firstch not in delimiters:
                toktype = PdfObject
            elif firstch in '/<(%':
                if firstch == '/':
                    # PDF Name
                    toktype = BasePdfName
                elif firstch == '<':
                    # << dict delim, or < hex string >
                    if token[1:2] != '<':
                        toktype = PdfString
                elif firstch == '(':
                    # Literal string
                    # It's probably simple, but maybe not
                    # Nested parentheses are a bear, and if
                    # they are present, we exit the for loop
                    # and get back in with a new starting location.
                    ends = None  # For broken strings
                    if fdata[match.end(1) - 1] != ')':
                        nest = 2
                        m_start, loc = tokspan
                        for match in findparen(fdata, loc):
                            loc = match.end(1)
                            ending = fdata[loc - 1] == ')'
                            nest += 1 - ending * 2
                            if not nest:
                                break
                            if ending and ends is None:
                                ends = loc, match.end(), nest
                        token = fdata[m_start:loc]
                        current[0] = m_start, match.end()
                        if nest:
                            # There is one possible recoverable error
                            # seen in the wild -- some stupid generators
                            # don't escape (.  If this happens, just
                            # terminate on first unescaped ).  The string
                            # won't be quite right, but that's a science
                            # fair project for another time.
                            (self.error, self.exception)[not ends](
                                'Unterminated literal string')
                            loc, ends, nest = ends
                            token = fdata[m_start:loc] + ')' * nest
                            current[0] = m_start, ends
                    toktype = PdfString
                elif firstch == '%':
                    # Comment
                    if self.strip_comments:
                        continue
            else:
                self.exception(('Tokenizer logic incorrect -- '
                                'should never get here'))

            newtok = get_cache(token)
            if newtok is None:
                newtok = cache[token] = toktype(token)
            yield newtok
            # Position was changed externally (or by string repair):
            # restart with a fresh finditer at the new location.
            if current[0] is not tokspan:
                break
        else:
            if self.strip_comments:
                break
    raise StopIteration
|
||||||
|
|
||||||
|
def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
    ''' Set up tokenizing of fdata beginning at offset startloc.

        strip_comments -- suppress comment tokens entirely
        verbose        -- when False, each distinct warning/error
                          message is reported only once
    '''
    self.fdata = fdata
    self.strip_comments = strip_comments
    iterator = self._gettoks(startloc)
    self.iterator = iterator
    # None means "report everything"; a set records seen messages.
    self.msgs_dumped = None if verbose else set()
    # Bind the version-appropriate advance method (next/__next__).
    self.next = getattr(iterator, nextattr)
    self.current = [(startloc, startloc)]
|
||||||
|
|
||||||
|
def setstart(self, startloc):
    """Reposition the tokenizer so the next token is read at *startloc*.

    A no-op when *startloc* already equals the current read position.
    """
    span = self.current
    if span[0][1] != startloc:
        span[0] = (startloc, startloc)
|
||||||
|
|
||||||
|
def floc(self):
    """Current file position (offset where the next token starts)."""
    _, end = self.current[0]
    return end

# Read via floc(); assignment delegates to setstart().
floc = property(floc, setstart)
|
||||||
|
|
||||||
|
def tokstart(self):
    """File position of the most recently retrieved token."""
    begin, _ = self.current[0]
    return begin

# Read via tokstart(); assignment delegates to setstart().
tokstart = property(tokstart, setstart)
|
||||||
|
|
||||||
|
def __iter__(self):
    """Iterate tokens by handing back the single shared token generator."""
    return self.iterator
|
||||||
|
|
||||||
|
def multiple(self, count, islice=itertools.islice, list=list):
    """Return up to *count* tokens as a list (fewer if the stream ends)."""
    taken = islice(self, count)
    return list(taken)
|
||||||
|
|
||||||
|
def next_default(self, default='nope'):
    """Return the next token, or *default* when the stream is exhausted."""
    return next(iter(self), default)
|
||||||
|
|
||||||
|
def msg(self, msg, *arg):
    """Format *msg* (optionally %-interpolated with *arg*) with file context.

    Returns None when the raw message was already emitted and
    deduplication is active (non-verbose mode); callers treat a falsy
    result as "suppressed".
    """
    seen = self.msgs_dumped
    if seen is not None:
        # Dedup on the raw (pre-interpolation) message text.
        if msg in seen:
            return None
        seen.add(msg)
    if arg:
        msg %= arg
    fdata = self.fdata
    begin, end = self.current[0]
    if begin >= len(fdata):
        return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
    line, col = linepos(fdata, begin)
    if end <= begin:
        return '%s (line=%d, col=%d)' % (msg, line, col)
    # A token span is available -- include (a truncated copy of) it.
    tok = fdata[begin:end].rstrip()
    if len(tok) > 30:
        tok = tok[:26] + ' ...'
    return '%s (line=%d, col=%d, token=%s)' % (msg, line, col, repr(tok))
|
||||||
|
|
||||||
|
def warning(self, *arg):
    """Emit a location-annotated warning; silent for deduplicated messages."""
    text = self.msg(*arg)
    if text:
        log.warning(text)
|
||||||
|
|
||||||
|
def error(self, *arg):
    """Emit a location-annotated error; silent for deduplicated messages."""
    text = self.msg(*arg)
    if text:
        log.error(text)
|
||||||
|
|
||||||
|
def exception(self, *arg):
    """Abort parsing: raise PdfParseError with a location-annotated message."""
    message = self.msg(*arg)
    raise PdfParseError(message)
|
|
@ -0,0 +1,146 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Converts pdfrw objects into reportlab objects.
|
||||||
|
|
||||||
|
Designed for and tested with rl 2.3.
|
||||||
|
|
||||||
|
Knows too much about reportlab internals.
|
||||||
|
What can you do?
|
||||||
|
|
||||||
|
The interface to this function is through the makerl() function.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
canv - a reportlab "canvas" (also accepts a "document")
|
||||||
|
pdfobj - a pdfrw PDF object
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A corresponding reportlab object, or if the
|
||||||
|
object is a PDF Form XObject, the name to
|
||||||
|
use with reportlab for the object.
|
||||||
|
|
||||||
|
Will recursively convert all necessary objects.
|
||||||
|
Be careful when converting a page -- if /Parent is set,
|
||||||
|
will recursively convert all pages!
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
1) Original objects are annotated with a
|
||||||
|
derived_rl_obj attribute which points to the
|
||||||
|
reportlab object. This keeps multiple reportlab
|
||||||
|
objects from being generated for the same pdfobj
|
||||||
|
via repeated calls to makerl. This is great for
|
||||||
|
not putting too many objects into the
|
||||||
|
new PDF, but not so good if you are modifying
|
||||||
|
objects for different pages. Then you
|
||||||
|
need to do your own deep copying (of circular
|
||||||
|
structures). You're on your own.
|
||||||
|
|
||||||
|
2) ReportLab seems weird about FormXObjects.
|
||||||
|
They pass around a partial name instead of the
|
||||||
|
object or a reference to it. So we have to
|
||||||
|
reach into reportlab and get a number for
|
||||||
|
a unique name. I guess this is to make it
|
||||||
|
where you can combine page streams with
|
||||||
|
impunity, but that's just a guess.
|
||||||
|
|
||||||
|
3) Updated 1/23/2010 to handle multipass documents
|
||||||
|
(e.g. with a table of contents). These have
|
||||||
|
a different doc object on every pass.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
from reportlab.pdfbase import pdfdoc as rldocmodule
|
||||||
|
from .objects import PdfDict, PdfArray, PdfName
|
||||||
|
from .py23_diffs import convert_store
|
||||||
|
|
||||||
|
# Short local aliases for the reportlab document-model classes this
# module emits.
RLStream = rldocmodule.PDFStream
RLDict = rldocmodule.PDFDictionary
RLArray = rldocmodule.PDFArray
|
||||||
|
|
||||||
|
|
||||||
|
def _makedict(rldoc, pdfobj):
    """Convert a stream-less PdfDict into a reportlab dictionary.

    Returns either the RLDict itself or, for indirect objects, a
    reportlab reference to it.
    """
    rldict = RLDict()
    result = rldict
    if pdfobj.indirect:
        result.__RefOnly__ = 1
        result = rldoc.Reference(result)
    # Memoize BEFORE recursing so circular structures terminate.
    pdfobj.derived_rl_obj[rldoc] = result, None

    for key, value in pdfobj.iteritems():
        # key[1:] strips the leading '/' from the PdfName.
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _makestream(rldoc, pdfobj, xobjtype=PdfName.XObject):
    """Convert a PdfDict with a stream into a reportlab stream object.

    Form XObjects additionally get a unique short name (reportlab
    addresses XObjects by name, not by reference).
    """
    rldict = RLDict()
    stream = RLStream(rldict, convert_store(pdfobj.stream))

    shortname = fullname = None
    if pdfobj.Type == xobjtype:
        # Derive a unique name from reportlab's own object counter.
        shortname = 'pdfrw_%s' % (rldoc.objectcounter + 1)
        fullname = rldoc.getXObjectName(shortname)
    result = rldoc.Reference(stream, fullname)
    # Memoize BEFORE recursing so circular structures terminate.
    pdfobj.derived_rl_obj[rldoc] = result, shortname

    for key, value in pdfobj.iteritems():
        # key[1:] strips the leading '/' from the PdfName.
        rldict[key[1:]] = makerl_recurse(rldoc, value)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _makearray(rldoc, pdfobj):
    """Convert a PdfArray into a reportlab array.

    Returns either the RLArray itself or, for indirect objects, a
    reportlab reference to it.
    """
    rlarray = RLArray([])
    result = rlarray
    if pdfobj.indirect:
        result.__RefOnly__ = 1
        result = rldoc.Reference(result)
    # Memoize BEFORE recursing so self-referential arrays terminate.
    pdfobj.derived_rl_obj[rldoc] = result, None

    out = rlarray.sequence
    for item in pdfobj:
        out.append(makerl_recurse(rldoc, item))

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _makestr(rldoc, pdfobj):
    """Convert a scalar (float/int/str) into reportlab's string form."""
    assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
    # TODO: Add fix for float like in pdfwriter
    encoded = getattr(pdfobj, 'encoded', None)
    return str(encoded or pdfobj)
|
||||||
|
|
||||||
|
|
||||||
|
def makerl_recurse(rldoc, pdfobj):
    """Convert *pdfobj* (and everything it references) to reportlab form.

    Results are memoized per reportlab document on the pdfobj's
    derived_rl_obj dict, so repeated conversion reuses one object.
    """
    docdict = getattr(pdfobj, 'derived_rl_obj', None)
    if docdict is not None:
        cached = docdict.get(rldoc)
        if cached is not None:
            return cached[0]

    if isinstance(pdfobj, PdfDict):
        func = _makedict if pdfobj.stream is None else _makestream
        if docdict is None:
            # PdfDict attributes go through .private to avoid key clashes.
            pdfobj.private.derived_rl_obj = {}
    elif isinstance(pdfobj, PdfArray):
        func = _makearray
        if docdict is None:
            pdfobj.derived_rl_obj = {}
    else:
        # Scalars are cheap; they are converted without memoization.
        func = _makestr
    return func(rldoc, pdfobj)
|
||||||
|
|
||||||
|
|
||||||
|
def makerl(canv, pdfobj):
    """Public entry point: convert *pdfobj* for use with reportlab *canv*.

    *canv* may be a canvas (its ._doc is used) or a document.  For Form
    XObjects the registered reportlab NAME is returned instead of the
    object, since that is what reportlab's drawing API expects.
    """
    rldoc = getattr(canv, '_doc', canv)
    rlobj = makerl_recurse(rldoc, pdfobj)
    try:
        name = pdfobj.derived_rl_obj[rldoc][1]
    except AttributeError:
        # Scalars carry no memoization dict and never have a name.
        name = None
    return name or rlobj
|
|
@ -0,0 +1,117 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# Copyright (C) 2012-2015 Nerijus Mika
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
# Copyright (c) 2006, Mathieu Fenniak
|
||||||
|
# BSD license -- see LICENSE.txt for details
|
||||||
|
'''
|
||||||
|
A small subset of decompression filters. Should add more later.
|
||||||
|
|
||||||
|
I believe, after looking at the code, that portions of the flate
|
||||||
|
PNG predictor were originally transcribed from PyPDF2, which is
|
||||||
|
probably an excellent source of additional filters.
|
||||||
|
'''
|
||||||
|
import array
|
||||||
|
from .objects import PdfDict, PdfName, PdfArray
|
||||||
|
from .errors import log
|
||||||
|
from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
|
||||||
|
|
||||||
|
|
||||||
|
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
    """Yield the members of *mylist* that are PdfDicts carrying a stream."""
    for candidate in mylist:
        if isinstance(candidate, PdfDict) and candidate.stream is not None:
            yield candidate
|
||||||
|
|
||||||
|
# Hack so we can import if zlib not available
# (py23_diffs presumably exports zlib as None in that case -- confirm;
# this keeps decompressobj defined either way so module import succeeds).
decompressobj = zlib if zlib is None else zlib.decompressobj
|
||||||
|
|
||||||
|
|
||||||
|
def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Decompress the streams of every stream object in *mylist* in place.

        Only /FlateDecode (optionally with a PNG predictor) is handled;
        anything else is logged and skipped.  On success the object's
        /Filter is cleared and its stream replaced with the decoded data
        (converted via convert_load unless leave_raw is true).

        Returns True only if every stream was either filter-less or
        successfully decompressed.

        NOTE: the mutable default *warnings* set is deliberate -- it
        deduplicates "cannot use filter" warnings across calls.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
            # Per the PDF spec, DecodeParms may be an array; flatten it
            # into a single dict.
            if isinstance(parms, PdfArray):
                oldparms = parms
                parms = PdfDict()
                for x in oldparms:
                    parms.update(x)
            if parms:
                predictor = int(parms.Predictor or 1)
                columns = int(parms.Columns or 1)
                colors = int(parms.Colors or 1)
                bpc = int(parms.BitsPerComponent or 8)
                # NOTE(review): if decompress() raised above, *data* is
                # unbound here and a 10..15 predictor would NameError --
                # latent bug, left as-is.
                if 10 <= predictor <= 15:
                    data, error = flate_png(data, predictor, columns, colors, bpc)
                elif predictor != 1:
                    error = ('Unsupported flatedecode predictor %s' %
                             repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                # Trailing non-whitespace after the zlib stream means the
                # stream length and the data disagree -- report it.
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
|
||||||
|
|
||||||
|
|
||||||
|
def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' PNG prediction is used to make certain kinds of data
        more compressible.  Before the compression, each data
        byte is either left the same, or is set to be a delta
        from the previous byte, or is set to be a delta from
        the previous row.  This selection is done on a per-row
        basis, and is indicated by a compression type byte
        prepended to each row of data.

        Within more recent PDF files, it is normal to use
        this technique for Xref stream objects, which are
        quite regular.

        Returns (decoded_bytes, None) on success or
        (None, error_message) on an unsupported filter byte.
    '''
    # Bytes per image row, excluding the leading filter-type byte.
    columnbytes = ((columns * colors * bpc) + 7) // 8
    data = array.array('B', data)
    rowlen = columnbytes + 1
    if predictor == 15:
        # NOTE(review): pads short data to a whole number of rows;
        # presumably to tolerate slightly-truncated streams -- confirm.
        padding = (rowlen - len(data)) % rowlen
        data.extend([0] * padding)
    assert len(data) % rowlen == 0
    rows = xrange(0, len(data), rowlen)
    for row_index in rows:
        # First byte of each row is the PNG filter type:
        # 0 = None, 1 = Sub (previous byte), 2 = Up (previous row).
        offset = data[row_index]
        if offset >= 2:
            if offset > 2:
                return None, 'Unsupported PNG filter %d' % offset
            # "Up" filter: delta against the same column one row back
            # (no-op for the very first row).
            offset = rowlen if row_index else 0
        if offset:
            # Undo the prediction in place, left to right, modulo 256.
            for index in xrange(row_index + 1, row_index + rowlen):
                data[index] = (data[index] + data[index - offset]) % 256
    # Strip the per-row filter-type bytes, back to front so earlier
    # indices stay valid.
    for row_index in reversed(rows):
        data.pop(row_index)
    return from_array(data), None
|
|
@ -0,0 +1,10 @@
|
||||||
|
Notes on releasing, which is not yet fully automated:
|
||||||
|
|
||||||
|
1) Update version number in pdfrw/__init__.py
|
||||||
|
|
||||||
|
2) Use pyroma
|
||||||
|
|
||||||
|
3) https://packaging.python.org/en/latest/distributing.html
|
||||||
|
|
||||||
|
a) python setup.py sdist bdist_wheel
|
||||||
|
b) twine upload dist/*
|
|
@ -0,0 +1,5 @@
|
||||||
|
[bdist_wheel]
|
||||||
|
# This flag says that the code is written to work on both Python 2 and Python
|
||||||
|
# 3. If at all possible, it is good practice to do this. If you cannot, you
|
||||||
|
# will need to generate wheels for each Python version that you support.
|
||||||
|
universal=1
|
|
@ -0,0 +1,40 @@
|
||||||
|
#!/usr/bin/env python
# Packaging script for pdfrw.  The version number lives in
# pdfrw/__init__.py; convert_load decodes README.rst portably on
# both Python 2 and Python 3.

from setuptools import setup

from pdfrw import __version__ as version
from pdfrw.py23_diffs import convert_load

# Read the long description up front with a context manager so the
# file handle is closed promptly instead of leaking until interpreter
# exit (the original used a bare open(...).read()).
with open("README.rst", 'rb') as readme:
    long_description = convert_load(readme.read())

setup(
    name='pdfrw',
    version=version,
    description='PDF file reader/writer library',
    long_description=long_description,
    author='Patrick Maupin',
    author_email='pmaupin@gmail.com',
    platforms='Independent',
    url='https://github.com/pmaupin/pdfrw',
    packages=['pdfrw', 'pdfrw.objects'],
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
        'Topic :: Software Development :: Libraries',
        'Topic :: Text Processing',
        'Topic :: Printing',
        'Topic :: Utilities',
    ],
    keywords='pdf vector graphics PDF nup watermark split join merge',
    zip_safe=True,
)
|
|
@ -0,0 +1 @@
|
||||||
|
# This file intentionally left blank.
|
|
@ -0,0 +1,81 @@
|
||||||
|
#! /usr/bin/env python2
# Interactive helper for triaging regression-test PDF output:
# walks every file under the results directory, compares it with the
# matching static source PDF, and lets the operator mark the result
# good/bad/skip in the expectations file.
# NOTE: Python 2 only (uses raw_input).

import sys
import os
import subprocess
import hashlib

import expected
import static_pdfs

# Map basename -> full path for every known-good source PDF.
source_pdfs = static_pdfs.pdffiles[0]
source_pdfs = dict((os.path.basename(x), x) for x in source_pdfs)

result_dir = expected.result_dir

for subdir in sorted(os.listdir(result_dir)):
    dstd = os.path.join(result_dir, subdir)
    if not os.path.isdir(dstd):
        continue
    for pdffile in sorted(os.listdir(dstd)):
        testname = '%s/%s' % (subdir, pdffile)
        srcf = source_pdfs.get(pdffile)
        dstf = os.path.join(dstd, pdffile)
        if pdffile not in source_pdfs:
            print('\n Skipping %s -- source not found' % testname)
            continue

        # Hash the produced file; skip anything already recorded
        # (good hash, known-bad '!' hash, or an explicit skip marker).
        # NOTE: 'hash' shadows the builtin -- kept for fidelity.
        with open(dstf, 'rb') as f:
            data = f.read()
        hash = hashlib.md5(data).hexdigest()
        skipset = set((hash, 'skip', 'xfail', 'fail', '!' + hash))
        if expected.results[testname] & skipset:
            print('\n Skipping %s -- marked done' % testname)
            continue
        # foobar.pdf holds the imagemagick comparison output; start clean.
        if os.path.exists('foobar.pdf'):
            os.remove('foobar.pdf')
        builtdiff = False
        while 1:
            sys.stdout.write('''
Test case %s

c = compare using imagemagick and okular
f = display foobar.pdf (result from comparison)
o = display results with okular
a = display results with acrobat

s = mark 'skip' and go to next PDF
g = mark as good and go to next PDF
b = mark as bad and go to next PDF
n = next pdf without marking
q = quit
--> ''' % testname)
            sel = raw_input()
            if sel == 'q':
                raise SystemExit(0)
            if sel == 'n':
                break
            if sel == 'c':
                subprocess.call(('compare', '-verbose', srcf, dstf,
                                 'foobar.pdf'))
                builtdiff = True
                continue
            if sel == 'f':
                subprocess.call(('okular', 'foobar.pdf'))
                continue
            if sel == 'o':
                subprocess.call(('okular', srcf, dstf))
                continue
            if sel == 'a':
                # Include the diff image if one was built this round.
                if builtdiff:
                    subprocess.call(('acroread', srcf, dstf, 'foobar.pdf'))
                else:
                    subprocess.call(('acroread', srcf, dstf))
                continue

            if sel in 'sgb':
                # Append the verdict: bare hash = good, '!'+hash = bad,
                # ' skip' = operator chose to skip.
                results = (hash if sel == 'g' else
                           ' skip' if sel == 's' else '!'+hash)
                with open(expected.expectedf, 'a') as f:
                    f.write('%s %s\n' % (testname, results))
                break
|
|
@ -0,0 +1,41 @@
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Read expected.txt, which should be in the format:
|
||||||
|
|
||||||
|
testname/srcname.pdf validhash
|
||||||
|
|
||||||
|
More than one validhash is allowed (on separate lines),
|
||||||
|
and hash-delimited comments are allowed.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
import collections
from pdfrw.py23_diffs import convert_load

# Results live next to this module, under tmp_results -- or under
# ramdisk/tmp_results when a ramdisk directory is present.
# NOTE(review): the 'ramdisk' existence check is relative to the
# current working directory, not root_dir -- confirm that is intended.
root_dir = os.path.dirname(__file__)
result_dir = 'tmp_results'
if os.path.exists('ramdisk'):
    result_dir = os.path.join('ramdisk', result_dir)
result_dir = os.path.join(root_dir, result_dir)

# Prefer a developer-local mytests.txt over the checked-in expected.txt.
# Falls back to the last candidate's path even if neither file exists.
for sourcef in ('mytests.txt', 'expected.txt'):
    expectedf = os.path.join(root_dir, sourcef)
    if os.path.exists(expectedf):
        break
|
||||||
|
|
||||||
|
|
||||||
|
def results():
    """Parse the expectations file into {testname: set of hashes/flags}.

    Each non-comment line holds "testname value"; '#' starts a comment.
    Missing testnames map to an empty set (defaultdict).
    """
    table = collections.defaultdict(set)
    with open(expectedf, 'rb') as f:
        for raw in f:
            fields = convert_load(raw).split('#', 1)[0].split()
            if not fields:
                continue
            key, value = fields
            table[key].add(value)
    return table

# Materialize once at import time; rebinding the name over the helper
# is deliberate.
results = results()
|
|
@ -0,0 +1,225 @@
|
||||||
|
# Example programs
|
||||||
|
|
||||||
|
examples/4up_b1c400de699af29ea3f1983bb26870ab 1b73c612c40b5082d955ed72f63644bd
|
||||||
|
examples/alter_b1c400de699af29ea3f1983bb26870ab 3c3ee465f45a685ba7098691be05a5ab
|
||||||
|
examples/booklet_b1c400de699af29ea3f1983bb26870ab d711b74110eefb4e9e6bf1a5bea16bfe
|
||||||
|
examples/extract_1975ef8db7355b1d691bc79d0749574b b4f5ee36a288da970ed040a9a733c8b0
|
||||||
|
examples/extract_c5c895deecf7a7565393587e0d61be2b 539aad09ef80907bb396c3260eb87d7b
|
||||||
|
examples/extract_d711b74110eefb4e9e6bf1a5bea16bfe 26ddfd09c6e6002228f06782c8544ac4
|
||||||
|
examples/print_two_b1c400de699af29ea3f1983bb26870ab 73c8a16aba44548c2c06dae6e2551961
|
||||||
|
examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb51265af08029
|
||||||
|
examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b
|
||||||
|
|
||||||
|
examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d
|
||||||
|
examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331
|
||||||
|
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e
|
||||||
|
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612
|
||||||
|
|
||||||
|
# All these are in the poster test
|
||||||
|
examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a68e8f8df
|
||||||
|
examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86
|
||||||
|
examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d
|
||||||
|
|
||||||
|
examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198
|
||||||
|
examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450
|
||||||
|
examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98
|
||||||
|
examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5
|
||||||
|
|
||||||
|
# List things that need work here (typically cause exceptions)
|
||||||
|
|
||||||
|
# Bad info dict -- works otherwise
|
||||||
|
|
||||||
|
simple/b1c400de699af29ea3f1983bb26870ab.pdf ecf2e28de18a724b53670c0d5637ec28
|
||||||
|
repaginate/b1c400de699af29ea3f1983bb26870ab.pdf 4d7d6c5f6e14c6eac1dfc055cebfa499
|
||||||
|
|
||||||
|
# 07b0ba4 is missing an object. Best we can do is report it
|
||||||
|
# (and we do)
|
||||||
|
|
||||||
|
repaginate/07b0ba4cff1c6ff73fd468b04b013457.pdf 993c763e085bce7ecc941ba104f4c892
|
||||||
|
simple/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
|
||||||
|
|
||||||
|
# b107 has a single page, but with an empty contents dict.
|
||||||
|
|
||||||
|
repaginate/b107669d1dd69eabb89765fabb2cb321.pdf 0652d2da25b50cad75863d0e2bbaa878
|
||||||
|
simple/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
|
||||||
|
|
||||||
|
# Encrypted files
|
||||||
|
|
||||||
|
repaginate/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip
|
||||||
|
repaginate/6e122f618c27f3aa9a689423e3be6b8d.pdf skip
|
||||||
|
repaginate/7dc787639aa6765214e9ff5494d231ed.pdf skip
|
||||||
|
repaginate/b4b27aaa1f9c7c524298e98be279bebb.pdf skip
|
||||||
|
repaginate/b5b6c6405d7b48418bccf97277957664.pdf skip
|
||||||
|
repaginate/bd0ef57aec16ded45bd89d61b54af0be.pdf skip
|
||||||
|
repaginate/dbb807a878ac1da6b91ac15c9de4e209.pdf skip
|
||||||
|
simple/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf skip
|
||||||
|
simple/6e122f618c27f3aa9a689423e3be6b8d.pdf skip
|
||||||
|
simple/7dc787639aa6765214e9ff5494d231ed.pdf skip
|
||||||
|
simple/b4b27aaa1f9c7c524298e98be279bebb.pdf skip
|
||||||
|
simple/b5b6c6405d7b48418bccf97277957664.pdf skip
|
||||||
|
simple/bd0ef57aec16ded45bd89d61b54af0be.pdf skip
|
||||||
|
simple/dbb807a878ac1da6b91ac15c9de4e209.pdf skip
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# List good hashes for round-trips here.
|
||||||
|
|
||||||
|
repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b
|
||||||
|
repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1
|
||||||
|
repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8
|
||||||
|
repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5
|
||||||
|
repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c
|
||||||
|
repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598
|
||||||
|
repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4
|
||||||
|
repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b
|
||||||
|
repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61
|
||||||
|
repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d
|
||||||
|
repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795
|
||||||
|
repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22
|
||||||
|
repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0
|
||||||
|
repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3
|
||||||
|
repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0
|
||||||
|
repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
|
||||||
|
repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2
|
||||||
|
repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45
|
||||||
|
repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315
|
||||||
|
repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69
|
||||||
|
repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c
|
||||||
|
repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f
|
||||||
|
repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e
|
||||||
|
repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261
|
||||||
|
repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c
|
||||||
|
repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||||
|
repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233
|
||||||
|
repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea
|
||||||
|
repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273
|
||||||
|
repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae
|
||||||
|
repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83
|
||||||
|
repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd
|
||||||
|
repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840
|
||||||
|
repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68
|
||||||
|
repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa
|
||||||
|
repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273
|
||||||
|
repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d
|
||||||
|
|
||||||
|
|
||||||
|
simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
|
||||||
|
simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef
|
||||||
|
simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||||
|
simple/09715ec1a7b0f3a7ae02b3046f627b9f.pdf c4e4b3b725bd5fc3b008f1ac6251ad1c
|
||||||
|
simple/1975ef8db7355b1d691bc79d0749574b.pdf 475c28c9588f3a7f6110d30f391758c4
|
||||||
|
simple/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 3f17f19fd92adf01998bb13a0ee52b92
|
||||||
|
simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
|
||||||
|
simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d
|
||||||
|
simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097
|
||||||
|
simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa
|
||||||
|
simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2
|
||||||
|
simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf
|
||||||
|
simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4
|
||||||
|
simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef
|
||||||
|
simple/365b9c95574ee8944370fe286905d0e8.pdf cf3bfac41f410bf5bd657e3f906dfbc6
|
||||||
|
simple/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
|
||||||
|
simple/49e31fd074eca6af981d78d42d0078ec.pdf 2c316537a5b0917634cbbdc5b91511df
|
||||||
|
simple/536dfc6fbadd87c03eb59375d091eb53.pdf 319851765c70ba103c4191f7ec2148db
|
||||||
|
simple/569f8094597bbe5b58efc3a7c6e14e87.pdf 025f1bf95cc537c36b8c3a044758b86c
|
||||||
|
simple/5f0cff36d0ad74536a6513a98a755016.pdf 8476fd75e75394fcbbe02816d0640e7d
|
||||||
|
simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9
|
||||||
|
simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||||
|
simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf
|
||||||
|
simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05
|
||||||
|
simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70
|
||||||
|
simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||||
|
simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e
|
||||||
|
simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
|
||||||
|
simple/9d8626d18b1d8807d271e6ffc409446a.pdf 2358d654bf20d2b9d179ab009a615c4e
|
||||||
|
simple/9f98322c243fe67726d56ccfa8e0885b.pdf 9290b4c32f005e1e4c7f431955246c4c
|
||||||
|
simple/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6b406128e0ed1ac23dc5a0ee34d1f717
|
||||||
|
simple/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
|
||||||
|
simple/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 2083f0e55cf06d88df02956a21bfef23
|
||||||
|
simple/d6fd9567078b48c86710e9c49173781f.pdf 77464ec5cfdacb61a73b506bc4945631
|
||||||
|
simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124
|
||||||
|
simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
|
||||||
|
simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27
|
||||||
|
|
||||||
|
|
||||||
|
decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
|
||||||
|
decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
|
||||||
|
decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b
|
||||||
|
decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36
|
||||||
|
decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||||
|
decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90
|
||||||
|
decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637
|
||||||
|
decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
|
||||||
|
decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b
|
||||||
|
decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad
|
||||||
|
decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9
|
||||||
|
decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3
|
||||||
|
decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f
|
||||||
|
decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e
|
||||||
|
decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a
|
||||||
|
decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6
|
||||||
|
decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
|
||||||
|
decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc
|
||||||
|
decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6
|
||||||
|
decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58
|
||||||
|
decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56
|
||||||
|
decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149
|
||||||
|
decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||||
|
decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0
|
||||||
|
decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158
|
||||||
|
decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24
|
||||||
|
decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||||
|
decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d
|
||||||
|
decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
|
||||||
|
decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6
|
||||||
|
decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 4b53b63b0779b81d8f9569e66ca3d8ee
|
||||||
|
decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
|
||||||
|
decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227
|
||||||
|
decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f
|
||||||
|
decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
|
||||||
|
decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a
|
||||||
|
decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4
|
||||||
|
decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380
|
||||||
|
decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
|
||||||
|
decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0
|
||||||
|
|
||||||
|
compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa
|
||||||
|
compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
|
||||||
|
compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e
|
||||||
|
compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3
|
||||||
|
compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||||
|
compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976
|
||||||
|
compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c
|
||||||
|
compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6
|
||||||
|
compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4
|
||||||
|
compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5
|
||||||
|
compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24
|
||||||
|
compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c
|
||||||
|
compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30
|
||||||
|
compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44
|
||||||
|
compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403
|
||||||
|
compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15
|
||||||
|
compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e
|
||||||
|
compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba
|
||||||
|
compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9
|
||||||
|
compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192
|
||||||
|
compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20
|
||||||
|
compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c
|
||||||
|
compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||||
|
compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7
|
||||||
|
compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7
|
||||||
|
compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e
|
||||||
|
compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594
|
||||||
|
compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1
|
||||||
|
compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81
|
||||||
|
compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd
|
||||||
|
compress/9f98322c243fe67726d56ccfa8e0885b.pdf 3167fa11a3f1f4a06f90294b21e101b7
|
||||||
|
compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
|
||||||
|
compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f
|
||||||
|
compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868
|
||||||
|
compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8
|
||||||
|
compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24
|
||||||
|
compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229
|
||||||
|
compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c
|
||||||
|
compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81
|
||||||
|
compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578
|
|
@ -0,0 +1,5 @@
|
||||||
|
import cProfile
|
||||||
|
import unittest
|
||||||
|
import test_roundtrip
|
||||||
|
|
||||||
|
cProfile.run('unittest.main(test_roundtrip)')
|
|
@ -0,0 +1,195 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Run from the directory above like so:
|
||||||
|
|
||||||
|
python -m tests.test_examples
|
||||||
|
|
||||||
|
A PDF that has been determined to be good or bad
|
||||||
|
should be added to expected.txt with either a good
|
||||||
|
checksum, or just the word "fail".
|
||||||
|
|
||||||
|
These tests are incomplete, but they allow us to try
|
||||||
|
out various PDFs. There is a collection of difficult
|
||||||
|
PDFs available on github.
|
||||||
|
|
||||||
|
In order to use them:
|
||||||
|
|
||||||
|
1) Insure that github.com/pmaupin/static_pdfs is on your path.
|
||||||
|
|
||||||
|
2) Use the imagemagick compare program to look at differences
|
||||||
|
between the static_pdfs/global directory and the tmp_results
|
||||||
|
directory after you run this.
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
import static_pdfs
|
||||||
|
import expected
|
||||||
|
|
||||||
|
from pdfrw.py23_diffs import convert_store
|
||||||
|
from pdfrw import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
try:
|
||||||
|
import unittest2 as unittest
|
||||||
|
except ImportError:
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
prog_dir = os.path.join(expected.root_dir, '..', 'examples', '%s.py')
|
||||||
|
prog_dir = os.path.abspath(prog_dir)
|
||||||
|
dstdir = os.path.join(expected.result_dir, 'examples')
|
||||||
|
hashfile = os.path.join(expected.result_dir, 'hashes.txt')
|
||||||
|
|
||||||
|
lookup = static_pdfs.pdffiles[0]
|
||||||
|
lookup = dict((os.path.basename(x)[:-4], x) for x in lookup)
|
||||||
|
|
||||||
|
|
||||||
|
class TestOnePdf(unittest.TestCase):
|
||||||
|
|
||||||
|
def do_test(self, params, prev_results=[''], scrub=False):
|
||||||
|
params = params.split()
|
||||||
|
hashkey = 'examples/%s' % '_'.join(params)
|
||||||
|
params = [lookup.get(x, x) for x in params]
|
||||||
|
progname = params[0]
|
||||||
|
params[0] = prog_dir % progname
|
||||||
|
srcf = params[1]
|
||||||
|
params.insert(0, sys.executable)
|
||||||
|
subdir, progname = os.path.split(progname)
|
||||||
|
subdir = os.path.join(dstdir, subdir)
|
||||||
|
if not os.path.exists(subdir):
|
||||||
|
os.makedirs(subdir)
|
||||||
|
os.chdir(subdir)
|
||||||
|
dstf = '%s.%s' % (progname, os.path.basename(srcf))
|
||||||
|
scrub = scrub and dstf
|
||||||
|
dstf = dstf if not scrub else 'final.%s' % dstf
|
||||||
|
hash = '------no-file-generated---------'
|
||||||
|
expects = expected.results[hashkey]
|
||||||
|
|
||||||
|
# If the test has been deliberately skipped,
|
||||||
|
# we are done. Otherwise, execute it even
|
||||||
|
# if we don't know about it yet, so we have
|
||||||
|
# results to compare.
|
||||||
|
|
||||||
|
result = 'fail'
|
||||||
|
size = 0
|
||||||
|
try:
|
||||||
|
if 'skip' in expects:
|
||||||
|
result = 'skip requested'
|
||||||
|
return self.skipTest(result)
|
||||||
|
elif 'xfail' in expects:
|
||||||
|
result = 'xfail requested'
|
||||||
|
return self.fail(result)
|
||||||
|
|
||||||
|
exists = os.path.exists(dstf)
|
||||||
|
if expects or not exists:
|
||||||
|
if exists:
|
||||||
|
os.remove(dstf)
|
||||||
|
if scrub and os.path.exists(scrub):
|
||||||
|
os.remove(scrub)
|
||||||
|
subprocess.call(params)
|
||||||
|
if scrub:
|
||||||
|
PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
|
||||||
|
with open(dstf, 'rb') as f:
|
||||||
|
data = f.read()
|
||||||
|
size = len(data)
|
||||||
|
if data:
|
||||||
|
hash = hashlib.md5(data).hexdigest()
|
||||||
|
lookup[hash] = dstf
|
||||||
|
prev_results[0] = hash
|
||||||
|
else:
|
||||||
|
os.remove(dstf)
|
||||||
|
if expects:
|
||||||
|
if len(expects) == 1:
|
||||||
|
expects, = expects
|
||||||
|
self.assertEqual(hash, expects)
|
||||||
|
else:
|
||||||
|
self.assertIn(hash, expects)
|
||||||
|
result = 'pass'
|
||||||
|
else:
|
||||||
|
result = 'skip'
|
||||||
|
self.skipTest('No hash available')
|
||||||
|
finally:
|
||||||
|
result = '%8d %-20s %s %s\n' % (size, result, hashkey, hash)
|
||||||
|
with open(hashfile, 'ab') as f:
|
||||||
|
f.write(convert_store(result))
|
||||||
|
|
||||||
|
def test_4up(self):
|
||||||
|
self.do_test('4up b1c400de699af29ea3f1983bb26870ab')
|
||||||
|
|
||||||
|
def test_booklet_unspread(self):
|
||||||
|
prev = [None]
|
||||||
|
self.do_test('booklet b1c400de699af29ea3f1983bb26870ab', prev)
|
||||||
|
if prev[0] is not None:
|
||||||
|
self.do_test('unspread ' + prev[0])
|
||||||
|
self.do_test('extract ' + prev[0])
|
||||||
|
|
||||||
|
def test_print_two(self):
|
||||||
|
self.do_test('print_two b1c400de699af29ea3f1983bb26870ab')
|
||||||
|
|
||||||
|
def test_watermarks(self):
|
||||||
|
self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
|
||||||
|
'06c86654f9a77e82f9adaa0086fc391c')
|
||||||
|
self.do_test('watermark b1c400de699af29ea3f1983bb26870ab '
|
||||||
|
'06c86654f9a77e82f9adaa0086fc391c -u')
|
||||||
|
|
||||||
|
def test_subset(self):
|
||||||
|
self.do_test('subset b1c400de699af29ea3f1983bb26870ab 1-3 5')
|
||||||
|
|
||||||
|
def test_alter(self):
|
||||||
|
self.do_test('alter b1c400de699af29ea3f1983bb26870ab')
|
||||||
|
|
||||||
|
def test_cat(self):
|
||||||
|
self.do_test('cat b1c400de699af29ea3f1983bb26870ab '
|
||||||
|
'06c86654f9a77e82f9adaa0086fc391c')
|
||||||
|
|
||||||
|
def test_rotate(self):
|
||||||
|
self.do_test('rotate 707e3e2d17cbe9ec2273414b3b63f333 '
|
||||||
|
'270 1-4 7-8 10-50 52-56')
|
||||||
|
|
||||||
|
def test_poster(self):
|
||||||
|
prev = [None]
|
||||||
|
self.do_test('subset 1975ef8db7355b1d691bc79d0749574b 21', prev)
|
||||||
|
self.do_test('rotate %s 90 1' % prev[0], prev)
|
||||||
|
self.do_test('poster %s' % prev[0], prev)
|
||||||
|
|
||||||
|
def test_extract(self):
|
||||||
|
self.do_test('extract 1975ef8db7355b1d691bc79d0749574b')
|
||||||
|
self.do_test('extract c5c895deecf7a7565393587e0d61be2b')
|
||||||
|
|
||||||
|
def test_rl1_4up(self):
|
||||||
|
if sys.version_info < (2, 7):
|
||||||
|
return
|
||||||
|
self.do_test('rl1/4up b1c400de699af29ea3f1983bb26870ab',
|
||||||
|
scrub=True)
|
||||||
|
|
||||||
|
def test_rl1_booklet(self):
|
||||||
|
if sys.version_info < (2, 7):
|
||||||
|
return
|
||||||
|
self.do_test('rl1/booklet b1c400de699af29ea3f1983bb26870ab',
|
||||||
|
scrub=True)
|
||||||
|
|
||||||
|
def test_rl1_subset(self):
|
||||||
|
if sys.version_info < (2, 7):
|
||||||
|
return
|
||||||
|
self.do_test('rl1/subset b1c400de699af29ea3f1983bb26870ab 3 5',
|
||||||
|
scrub=True)
|
||||||
|
|
||||||
|
def test_rl1_platypus(self):
|
||||||
|
if sys.version_info < (2, 7):
|
||||||
|
return
|
||||||
|
self.do_test('rl1/platypus_pdf_template b1c400de699af29ea3f1983bb26870ab',
|
||||||
|
scrub=True)
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unittest.main()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -0,0 +1,39 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
# encoding: utf-8
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||||
|
# 2016 James Laird-Wah, Sydney, Australia
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Run from the directory above like so:
|
||||||
|
python -m tests.test_pdfstring
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
from pdfrw import PdfDict, PdfName
|
||||||
|
from pdfrw.objects import PdfIndirect
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
class TestPdfDicts(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_indirect_set_get(self):
|
||||||
|
io = PdfIndirect((1,2,3))
|
||||||
|
io.value = 42
|
||||||
|
d = PdfDict()
|
||||||
|
d.Name = io
|
||||||
|
test, = (x for x in dict.values(d))
|
||||||
|
self.assertEqual(test, io)
|
||||||
|
v = d['/Name']
|
||||||
|
self.assertEqual(v, io.value)
|
||||||
|
test, = d
|
||||||
|
self.assertEqual(type(test), type(PdfName.Name))
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unittest.main()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -0,0 +1,28 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
import static_pdfs
|
||||||
|
|
||||||
|
from pdfrw import PdfReader
|
||||||
|
|
||||||
|
try:
|
||||||
|
import unittest2 as unittest
|
||||||
|
except ImportError:
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
class TestPdfReaderInit(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_fname_binary_filelike(self):
|
||||||
|
with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file:
|
||||||
|
PdfReader(pdf_file)
|
||||||
|
|
||||||
|
def test_fdata_binary(self):
|
||||||
|
with open(static_pdfs.pdffiles[0][0], 'rb') as pdf_file:
|
||||||
|
pdf_bytes = pdf_file.read()
|
||||||
|
PdfReader(fdata=pdf_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unittest.main()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -0,0 +1,120 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
# encoding: utf-8
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||||
|
# 2016 James Laird-Wah, Sydney, Australia
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Run from the directory above like so:
|
||||||
|
python -m tests.test_pdfstring
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
from pdfrw import PdfString
|
||||||
|
from pdfrw.py23_diffs import convert_store
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
class TestBaseEncoding(unittest.TestCase):
|
||||||
|
|
||||||
|
def encode(self, value):
|
||||||
|
x = PdfString.encode(value)
|
||||||
|
if isinstance(value, type(u'')):
|
||||||
|
y = PdfString.from_unicode(value)
|
||||||
|
else:
|
||||||
|
y = PdfString.from_bytes(value)
|
||||||
|
self.assertEqual(x, y)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def decode(self, value):
|
||||||
|
s = PdfString(value)
|
||||||
|
x = s.to_unicode()
|
||||||
|
y = s.decode()
|
||||||
|
self.assertEqual(x, y)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def decode_bytes(self, decode_this, expected):
|
||||||
|
""" Decode to bytes"""
|
||||||
|
self.assertEqual(PdfString(decode_this).to_bytes(),
|
||||||
|
convert_store(expected))
|
||||||
|
|
||||||
|
def roundtrip(self, value, expected=None):
|
||||||
|
result = self.encode(value)
|
||||||
|
self.assertEqual(value, self.decode(result))
|
||||||
|
if expected is not None:
|
||||||
|
self.assertEqual(result, expected)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def test_doubleslash(self):
|
||||||
|
self.roundtrip('\\')
|
||||||
|
self.roundtrip(r'\\')
|
||||||
|
|
||||||
|
def test_unicode_encoding(self):
|
||||||
|
# These chars are in PdfDocEncoding
|
||||||
|
self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(')
|
||||||
|
# These chars are not in PdfDocEncoding
|
||||||
|
self.assertEqual(self.roundtrip(u'δΩσ')[0], '<')
|
||||||
|
# Check that we're doing a reasonable encoding
|
||||||
|
# Might want to change this later if we change the definition of reasonable
|
||||||
|
self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)')
|
||||||
|
self.roundtrip(u'(\n\u0101', '<FEFF0028000A0101>')
|
||||||
|
|
||||||
|
|
||||||
|
def test_constructor(self):
|
||||||
|
obj = PdfString('hello')
|
||||||
|
|
||||||
|
def test_continuation(self):
|
||||||
|
# See PDF 1.7 ref section 3.2 page 55
|
||||||
|
s1 = PdfString('(These two strings are the same.)')
|
||||||
|
self.assertEqual(s1.decode(), s1[1:-1])
|
||||||
|
s2 = PdfString('(These \\\ntwo strings \\\nare the same.)')
|
||||||
|
self.assertEqual(s1.decode(), s2.decode())
|
||||||
|
s2 = PdfString(s2.replace('\n', '\r'))
|
||||||
|
self.assertEqual(s1.decode(), s2.decode())
|
||||||
|
s2 = PdfString(s2.replace('\r', '\r\n'))
|
||||||
|
self.assertEqual(s1.decode(), s2.decode())
|
||||||
|
|
||||||
|
def test_hex_whitespace(self):
|
||||||
|
# See PDF 1.7 ref section 3.2 page 56
|
||||||
|
self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB')
|
||||||
|
|
||||||
|
def test_unicode_escaped_decode(self):
|
||||||
|
# Some PDF producers happily put unicode strings in PdfDocEncoding,
|
||||||
|
# because the Unicode BOM and \0 are valid code points
|
||||||
|
decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)')
|
||||||
|
self.assertEqual(decoded, "hello")
|
||||||
|
|
||||||
|
|
||||||
|
def test_unescaping(self):
|
||||||
|
self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)',
|
||||||
|
' ( ) \\ \n \t \f \r \r\n \\n')
|
||||||
|
|
||||||
|
self.decode_bytes(r'(\b\010\10)', '\b\b\b')
|
||||||
|
self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)',
|
||||||
|
'\n\n\r\r\t\t\b\b\f\f()\001\023\f3')
|
||||||
|
self.decode_bytes(r'(\\\nabc)', '\\\nabc')
|
||||||
|
self.decode_bytes(r'(\ )', ' ')
|
||||||
|
|
||||||
|
def test_BOM_variants(self):
|
||||||
|
self.roundtrip(u'\ufeff', '<FEFFFEFF>')
|
||||||
|
self.roundtrip(u'\ufffe', '<FEFFFFFE>')
|
||||||
|
self.roundtrip(u'\xfe\xff', '<FEFF00FE00FF>')
|
||||||
|
self.roundtrip(u'\xff\xfe', '(\xff\xfe)')
|
||||||
|
self.assertRaises(UnicodeError, PdfString.from_unicode,
|
||||||
|
u'þÿ blah', text_encoding='pdfdocencoding')
|
||||||
|
|
||||||
|
def test_byte_encode(self):
|
||||||
|
self.assertEqual(self.encode(b'ABC'), '(ABC)')
|
||||||
|
|
||||||
|
def test_nullstring(self):
|
||||||
|
self.assertEqual(PdfString('<>').to_bytes(), b'')
|
||||||
|
self.assertEqual(PdfString('()').to_bytes(), b'')
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unittest.main()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -0,0 +1,138 @@
|
||||||
|
#! /usr/bin/env python
|
||||||
|
|
||||||
|
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||||
|
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
|
||||||
|
# MIT license -- See LICENSE.txt for details
|
||||||
|
|
||||||
|
'''
|
||||||
|
Run from the directory above like so:
|
||||||
|
|
||||||
|
python -m tests.test_roundtrip
|
||||||
|
|
||||||
|
A PDF that has been determined to be good or bad
|
||||||
|
should be added to expected.txt with either a good
|
||||||
|
checksum, or just the word "fail".
|
||||||
|
|
||||||
|
These tests are incomplete, but they allow us to try
|
||||||
|
out various PDFs. There is a collection of difficult
|
||||||
|
PDFs available on github.
|
||||||
|
|
||||||
|
In order to use them:
|
||||||
|
|
||||||
|
1) Insure that github.com/pmaupin/static_pdfs is on your path.
|
||||||
|
|
||||||
|
2) Use the imagemagick compare program to look at differences
|
||||||
|
between the static_pdfs/global directory and the tmp_results
|
||||||
|
directory after you run this.
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
import pdfrw
|
||||||
|
import static_pdfs
|
||||||
|
import expected
|
||||||
|
|
||||||
|
from pdfrw.py23_diffs import convert_store
|
||||||
|
|
||||||
|
try:
|
||||||
|
import unittest2 as unittest
|
||||||
|
except ImportError:
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
class TestOnePdf(unittest.TestCase):
|
||||||
|
|
||||||
|
def roundtrip(self, testname, basename, srcf, decompress=False,
|
||||||
|
compress=False, repaginate=False):
|
||||||
|
dstd = os.path.join(expected.result_dir, testname)
|
||||||
|
if not os.path.exists(dstd):
|
||||||
|
os.makedirs(dstd)
|
||||||
|
dstf = os.path.join(dstd, basename)
|
||||||
|
hashfile = os.path.join(expected.result_dir, 'hashes.txt')
|
||||||
|
hashkey = '%s/%s' % (testname, basename)
|
||||||
|
hash = '------no-file-generated---------'
|
||||||
|
expects = expected.results[hashkey]
|
||||||
|
|
||||||
|
# If the test has been deliberately skipped,
|
||||||
|
# we are done. Otherwise, execute it even
|
||||||
|
# if we don't know about it yet, so we have
|
||||||
|
# results to compare.
|
||||||
|
|
||||||
|
result = 'fail'
|
||||||
|
size = 0
|
||||||
|
try:
|
||||||
|
if 'skip' in expects:
|
||||||
|
result = 'skip requested'
|
||||||
|
return self.skipTest(result)
|
||||||
|
elif 'xfail' in expects:
|
||||||
|
result = 'xfail requested'
|
||||||
|
return self.fail(result)
|
||||||
|
|
||||||
|
exists = os.path.exists(dstf)
|
||||||
|
if expects or not exists:
|
||||||
|
if exists:
|
||||||
|
os.remove(dstf)
|
||||||
|
trailer = pdfrw.PdfReader(srcf, decompress=decompress,
|
||||||
|
verbose=False)
|
||||||
|
if trailer.Encrypt:
|
||||||
|
result = 'skip -- encrypt'
|
||||||
|
hash = '------skip-encrypt-no-file------'
|
||||||
|
return self.skipTest('File encrypted')
|
||||||
|
writer = pdfrw.PdfWriter(dstf, compress=compress)
|
||||||
|
if repaginate:
|
||||||
|
writer.addpages(trailer.pages)
|
||||||
|
else:
|
||||||
|
writer.trailer = trailer
|
||||||
|
writer.write()
|
||||||
|
with open(dstf, 'rb') as f:
|
||||||
|
data = f.read()
|
||||||
|
size = len(data)
|
||||||
|
if data:
|
||||||
|
hash = hashlib.md5(data).hexdigest()
|
||||||
|
else:
|
||||||
|
os.remove(dstf)
|
||||||
|
if expects:
|
||||||
|
if len(expects) == 1:
|
||||||
|
expects, = expects
|
||||||
|
self.assertEqual(hash, expects)
|
||||||
|
else:
|
||||||
|
self.assertIn(hash, expects)
|
||||||
|
result = 'pass'
|
||||||
|
else:
|
||||||
|
result = 'skip'
|
||||||
|
self.skipTest('No hash available')
|
||||||
|
finally:
|
||||||
|
result = '%8d %-20s %s %s\n' % (size, result, hashkey, hash)
|
||||||
|
with open(hashfile, 'ab') as f:
|
||||||
|
f.write(convert_store(result))
|
||||||
|
|
||||||
|
|
||||||
|
def build_tests():
|
||||||
|
def test_closure(*args, **kw):
|
||||||
|
def test(self):
|
||||||
|
self.roundtrip(*args, **kw)
|
||||||
|
return test
|
||||||
|
for mytest, repaginate, decompress, compress in (
|
||||||
|
('simple', False, False, False),
|
||||||
|
('repaginate', True, False, False),
|
||||||
|
('decompress', False, True, False),
|
||||||
|
('compress', False, True, True),
|
||||||
|
):
|
||||||
|
for srcf in static_pdfs.pdffiles[0]:
|
||||||
|
basename = os.path.basename(srcf)
|
||||||
|
test_name = 'test_%s_%s' % (mytest, basename)
|
||||||
|
test = test_closure(mytest, basename, srcf,
|
||||||
|
repaginate=repaginate,
|
||||||
|
decompress=decompress,
|
||||||
|
compress=compress,
|
||||||
|
)
|
||||||
|
setattr(TestOnePdf, test_name, test)
|
||||||
|
build_tests()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unittest.main()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
|
@ -0,0 +1,84 @@
|
||||||
|
#! /usr/bin/env python2
|
||||||
|
"""
|
||||||
|
Put old (good) results in ramdisk/reference,
|
||||||
|
then generate new (unknown) test results in ramdisk/tmp_results,
|
||||||
|
THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally:
|
||||||
|
|
||||||
|
run this to update any checksums in expected.txt where both versions
|
||||||
|
parse to same PDFs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject
|
||||||
|
|
||||||
|
|
||||||
|
def make_canonical(trailer):
|
||||||
|
''' Canonicalizes a PDF. Assumes everything
|
||||||
|
is a Pdf object already.
|
||||||
|
'''
|
||||||
|
visited = set()
|
||||||
|
workitems = list(trailer.values())
|
||||||
|
while workitems:
|
||||||
|
obj = workitems.pop()
|
||||||
|
objid = id(obj)
|
||||||
|
if objid in visited:
|
||||||
|
continue
|
||||||
|
visited.add(objid)
|
||||||
|
obj.indirect = True
|
||||||
|
if isinstance(obj, (PdfArray, PdfDict)):
|
||||||
|
if isinstance(obj, PdfArray):
|
||||||
|
workitems += obj
|
||||||
|
else:
|
||||||
|
workitems += obj.values()
|
||||||
|
return trailer
|
||||||
|
|
||||||
|
with open('expected.txt', 'rb') as f:
|
||||||
|
expected = f.read()
|
||||||
|
|
||||||
|
def get_digest(fname):
|
||||||
|
with open(fname, 'rb') as f:
|
||||||
|
data = f.read()
|
||||||
|
if data:
|
||||||
|
return hashlib.md5(data).hexdigest()
|
||||||
|
|
||||||
|
tmp = '_temp.pdf'
|
||||||
|
count = 0
|
||||||
|
goodcount = 0
|
||||||
|
|
||||||
|
changes = []
|
||||||
|
for (srcpath, _, filenames) in os.walk('ramdisk/reference'):
|
||||||
|
for name in filenames:
|
||||||
|
if not name.endswith('.pdf'):
|
||||||
|
continue
|
||||||
|
src = os.path.join(srcpath, name)
|
||||||
|
dst = src.replace('/reference/', '/tmp_results/')
|
||||||
|
if not os.path.exists(dst):
|
||||||
|
continue
|
||||||
|
src_digest = get_digest(src)
|
||||||
|
if not src_digest or src_digest not in expected:
|
||||||
|
continue
|
||||||
|
print src
|
||||||
|
count += 1
|
||||||
|
trailer = make_canonical(PdfReader(src))
|
||||||
|
out = PdfWriter(tmp)
|
||||||
|
out.write(trailer=trailer)
|
||||||
|
match_digest = get_digest(tmp)
|
||||||
|
if not match_digest:
|
||||||
|
continue
|
||||||
|
trailer = make_canonical(PdfReader(dst))
|
||||||
|
out = PdfWriter(tmp)
|
||||||
|
out.write(trailer=trailer)
|
||||||
|
if get_digest(tmp) != match_digest:
|
||||||
|
continue
|
||||||
|
goodcount += 1
|
||||||
|
print "OK"
|
||||||
|
changes.append((src_digest, get_digest(dst)))
|
||||||
|
|
||||||
|
print count, goodcount
|
||||||
|
|
||||||
|
for stuff in changes:
|
||||||
|
expected = expected.replace(*stuff)
|
||||||
|
|
||||||
|
with open('expected.txt', 'wb') as f:
|
||||||
|
f.write(expected)
|
Loading…
Reference in New Issue