Record pdfrw (0.4-2) in archive suite sid
This commit is contained in:
commit
67e875f025
|
@ -1,3 +1,17 @@
|
|||
# OSX
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
Icon
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear on external disk
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
|
||||
|
||||
# Development artifacts
|
||||
diffs.txt
|
||||
examples/*.pdf
|
||||
|
@ -9,6 +23,7 @@ tests/pdfrw
|
|||
tests/static_pdfs
|
||||
tests/ramdisk
|
||||
tests/saved_results
|
||||
tests/tmp_results
|
||||
wiki/
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ python:
|
|||
- "2.7"
|
||||
- "3.3"
|
||||
- "3.4"
|
||||
- "3.5"
|
||||
- "3.6"
|
||||
- "nightly"
|
||||
# command to install dependencies
|
||||
before_install:
|
||||
|
@ -11,6 +13,7 @@ before_install:
|
|||
install:
|
||||
- "pip install ."
|
||||
- "pip install reportlab || true"
|
||||
- "pip install PyCrypto || true"
|
||||
- "pip install zlib || true"
|
||||
- "pip install unittest2 || true"
|
||||
# command to run tests
|
||||
|
|
14
LICENSE.txt
14
LICENSE.txt
|
@ -8,10 +8,22 @@ Mathieu Fenniak and licensed under the BSD license (also reproduced below).
|
|||
|
||||
Please add any missing authors here:
|
||||
|
||||
Copyright (c) 2006-2015 Patrick Maupin. All rights reserved.
|
||||
Copyright (c) 2006-2017 Patrick Maupin. All rights reserved.
|
||||
Copyright (c) 2006 Mathieu Fenniak. All rights reserved.
|
||||
Copyright (c) 2010 Attila Tajti. All rights reserved.
|
||||
Copyright (c) 2012 Nerijus Mika. All rights reserved.
|
||||
Copyright (c) 2015 Bastien Gandouet. All rights reserved.
|
||||
Copyright (c) 2015 Tzerjen Wei. All rights reserved.
|
||||
Copyright (c) 2015 Jorj X. McKie. All rights reserved.
|
||||
Copyright (c) 2015 Nicholas Devenish. All rights reserved.
|
||||
Copyright (c) 2015-2016 Jonatan Dellagostin. All rights reserved.
|
||||
Copyright (c) 2016-2017 Thomas Kluyver. All rights reserved.
|
||||
Copyright (c) 2016 James Laird-Wah. All rights reserved.
|
||||
Copyright (c) 2016 Marcus Brinkmann. All rights reserved.
|
||||
Copyright (c) 2016 Edward Betts. All rights reserved.
|
||||
Copyright (c) 2016 Patrick Mazulo. All rights reserved.
|
||||
Copyright (c) 2017 Haochen Wu. All rights reserved.
|
||||
Copyright (c) 2017 Jon Lund Steffensen. All rights reserved.
|
||||
|
||||
|
||||
MIT License:
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
include *.txt *.in *.rst
|
||||
recursive-include examples *.txt *.py
|
||||
recursive-include tests *.py
|
||||
|
|
54
README.rst
54
README.rst
|
@ -1,6 +1,6 @@
|
|||
=============
|
||||
pdfrw 0.2b1
|
||||
=============
|
||||
==================
|
||||
pdfrw 0.4
|
||||
==================
|
||||
|
||||
:Author: Patrick Maupin
|
||||
|
||||
|
@ -14,7 +14,7 @@ Introduction
|
|||
|
||||
**pdfrw** is a Python library and utility that reads and writes PDF files:
|
||||
|
||||
* Version 0.2 is tested and works on Python 2.6, 2.7, 3.3, and 3.4.
|
||||
* Version 0.4 is tested and works on Python 2.6, 2.7, 3.3, 3.4, 3.5, and 3.6
|
||||
* Operations include subsetting, merging, rotating, modifying metadata, etc.
|
||||
* The fastest pure Python PDF parser available
|
||||
* Has been used for years by a printer in pre-press production
|
||||
|
@ -74,10 +74,13 @@ try to use pdftk to uncompress and/or unencrypt them first.
|
|||
output.
|
||||
* `rl1/subset.py`__ Another subsetting example, using reportlab canvas for
|
||||
output.
|
||||
* `rl1/platypus_pdf_template.py`__ Aother watermarking example, using
|
||||
* `rl1/platypus_pdf_template.py`__ Another watermarking example, using
|
||||
reportlab canvas and generated output for the document. Contributed
|
||||
by user asannes.
|
||||
* `rl2`__ Experimental code for parsing graphics. Needs work.
|
||||
* `subset_booklets.py`__ shows an example of creating a full printable pdf
|
||||
version in a more professional and pratical way ( take a look at
|
||||
http://www.wikihow.com/Bind-a-Book )
|
||||
|
||||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/4up.py
|
||||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/alter.py
|
||||
|
@ -95,6 +98,7 @@ __ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/booklet.py
|
|||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/subset.py
|
||||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl1/platypus_pdf_template.py
|
||||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/rl2/
|
||||
__ https://github.com/pmaupin/pdfrw/tree/master/examples/subset_booklets.py
|
||||
|
||||
Notes on selected examples
|
||||
------------------------------------
|
||||
|
@ -715,6 +719,8 @@ non-pure-Python libraries
|
|||
files.
|
||||
- `pycairo <http://www.cairographics.org/pycairo/>`__ can write PDF
|
||||
files.
|
||||
- `PyMuPDF <https://github.com/rk700/PyMuPDF>`_ high performance rendering
|
||||
of PDF, (Open)XPS, CBZ and EPUB
|
||||
|
||||
Other tools
|
||||
-----------
|
||||
|
@ -723,12 +729,50 @@ Other tools
|
|||
line tool for basic PDF manipulation. It complements pdfrw extremely
|
||||
well, supporting many operations such as decryption and decompression
|
||||
that pdfrw cannot do.
|
||||
- `MuPDF <http://www.mupdf.com/>`_ is a free top performance PDF, (Open)XPS, CBZ and EPUB rendering library
|
||||
that also comes with some command line tools. One of those, ``mutool``, has big overlaps with pdftk's -
|
||||
except it is up to 10 times faster.
|
||||
|
||||
Release information
|
||||
=======================
|
||||
|
||||
Revisions:
|
||||
|
||||
0.4 -- Released 18 September, 2017
|
||||
|
||||
- Python 3.6 added to test matrix
|
||||
- Proper unicode support for text strings in PDFs added
|
||||
- buildxobj fixes allow better support creating form XObjects
|
||||
out of compressed pages in some cases
|
||||
- Compression fixes for Python 3+
|
||||
- New subset_booklets.py example
|
||||
- Bug with non-compressed indices into compressed object streams fixed
|
||||
- Bug with distinguishing compressed object stream first objects fixed
|
||||
- Better error reporting added for some invalid PDFs (e.g. when reading
|
||||
past the end of file)
|
||||
- Better scrubbing of old bookmark information when writing PDFs, to
|
||||
remove dangling references
|
||||
- Refactoring of pdfwriter, including updating API, to allow future
|
||||
enhancements for things like incremental writing
|
||||
- Minor tokenizer speedup
|
||||
- Some flate decompressor bugs fixed
|
||||
- Compression and decompression tests added
|
||||
- Tests for new unicode handling added
|
||||
- PdfReader.readpages() recursion error (issue #92) fixed.
|
||||
- Initial crypt filter support added
|
||||
|
||||
|
||||
0.3 -- Released 19 October, 2016.
|
||||
|
||||
- Python 3.5 added to test matrix
|
||||
- Better support under Python 3.x for in-memory PDF file-like objects
|
||||
- Some pagemerge and Unicode patches added
|
||||
- Changes to logging allow better coexistence with other packages
|
||||
- Fix for "from pdfrw import \*"
|
||||
- New fancy_watermark.py example shows off capabilities of pagemerge.py
|
||||
- metadata.py example renamed to cat.py
|
||||
|
||||
|
||||
0.2 -- Released 21 June, 2015. Supports Python 2.6, 2.7, 3.3, and 3.4.
|
||||
|
||||
- Several bugs have been fixed
|
||||
|
|
|
@ -1,3 +1,32 @@
|
|||
pdfrw (0.4-2) unstable; urgency=medium
|
||||
|
||||
* Bumped Standards-Version to 4.1.3
|
||||
* Replaced python-reportlab in python3-pdfrw by python3-reportlab
|
||||
|
||||
-- Rodrigo Siqueira <siqueira@ime.usp.br> Thu, 12 Apr 2018 12:14:12 -0300
|
||||
|
||||
pdfrw (0.4-1) unstable; urgency=medium
|
||||
|
||||
* New upstream version
|
||||
* Added "Multi-Arch: foreign" to python-pdfrw-doc
|
||||
|
||||
[ Lucas Kanashiro ]
|
||||
* Update years of upstream copyright
|
||||
* debian/copyright: use https:// instead of http:// in Format field
|
||||
|
||||
-- Rodrigo Siqueira <siqueira@ime.usp.br> Thu, 21 Sep 2017 09:55:46 -0300
|
||||
|
||||
pdfrw (0.3-1) unstable; urgency=medium
|
||||
|
||||
* New maintainer (Closes: #738298)
|
||||
* New upstream version
|
||||
* Bumped Standards-Version to 4.0.0
|
||||
* Bumped debian/compat to 10
|
||||
* Depend on debhelper >= 10
|
||||
* Added package test with autopkgtests tool
|
||||
|
||||
-- Rodrigo Siqueira <siqueira@ime.usp.br> Wed, 30 Aug 2017 19:18:45 -0300
|
||||
|
||||
pdfrw (0.2-3) unstable; urgency=medium
|
||||
|
||||
* QA upload.
|
||||
|
|
|
@ -1 +1 @@
|
|||
9
|
||||
10
|
||||
|
|
|
@ -1,20 +1,21 @@
|
|||
Source: pdfrw
|
||||
Section: python
|
||||
Priority: optional
|
||||
Maintainer: Debian QA Group <packages@qa.debian.org>
|
||||
Maintainer: Rodrigo Siqueira <siqueira@ime.usp.br>
|
||||
Build-Depends:
|
||||
debhelper (>= 9),
|
||||
debhelper (>= 10),
|
||||
dh-python,
|
||||
python-all (>= 2.6.6-3~),
|
||||
python-setuptools,
|
||||
python3-all,
|
||||
python3-setuptools,
|
||||
Standards-Version: 3.9.8
|
||||
Standards-Version: 4.1.3
|
||||
Homepage: https://github.com/pmaupin/pdfrw
|
||||
Vcs-Git: https://git.dgit.debian.org/pdfrw
|
||||
Vcs-Browser: https://browse.dgit.debian.org/pdfrw.git/
|
||||
X-Python-Version: >= 2.6
|
||||
X-Python3-Version: >= 3.2
|
||||
Testsuite: autopkgtest-pkg-python
|
||||
|
||||
Package: python-pdfrw
|
||||
Architecture: all
|
||||
|
@ -44,6 +45,7 @@ Description: PDF file manipulation library (Python 2)
|
|||
|
||||
Package: python-pdfrw-doc
|
||||
Architecture: all
|
||||
Multi-Arch: foreign
|
||||
Depends:
|
||||
${misc:Depends},
|
||||
Section: doc
|
||||
|
@ -72,7 +74,7 @@ Depends:
|
|||
${python3:Depends},
|
||||
Suggests:
|
||||
python-pdfrw-doc,
|
||||
python-reportlab,
|
||||
python3-reportlab,
|
||||
Description: PDF file manipulation library (Python 3)
|
||||
pdfrw can read and write PDF files, and can also be used to read in PDFs which
|
||||
can then be used inside reportlab.
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: pdfrw
|
||||
Upstream-Contact: Patrick Maupin <pmaupin@gmail.com>
|
||||
Source: https://github.com/pmaupin/pdfrw
|
||||
|
||||
Files: *
|
||||
Copyright: © 2006-2015 Patrick Maupin
|
||||
Copyright: © 2006-2017 Patrick Maupin
|
||||
© 2010 Attila Tajti
|
||||
© 2012 Narijus Mika
|
||||
License: Expat
|
||||
|
|
|
@ -27,7 +27,7 @@ def get4(srcpages):
|
|||
inpfn, = sys.argv[1:]
|
||||
outfn = '4up.' + os.path.basename(inpfn)
|
||||
pages = PdfReader(inpfn).pages
|
||||
writer = PdfWriter()
|
||||
writer = PdfWriter(outfn)
|
||||
for index in range(0, len(pages), 4):
|
||||
writer.addpage(get4(pages[index:index + 4]))
|
||||
writer.write(outfn)
|
||||
writer.write()
|
||||
|
|
|
@ -6,7 +6,7 @@ alter.py -- Simple example of making a very slight modification to a PDF.
|
|||
|
||||
booklet.py -- Converts a PDF into a booklet.
|
||||
|
||||
metadata.py -- Concatenates multiple PDFs, adds metadata.
|
||||
cat.py -- Concatenates multiple PDFs, adds metadata.
|
||||
|
||||
poster.py -- Changes the size of a PDF to create a poster
|
||||
|
||||
|
|
|
@ -19,6 +19,4 @@ outfn = 'alter.' + os.path.basename(inpfn)
|
|||
|
||||
trailer = PdfReader(inpfn)
|
||||
trailer.Info.Title = 'My New Title Goes Here'
|
||||
writer = PdfWriter()
|
||||
writer.trailer = trailer
|
||||
writer.write(outfn)
|
||||
PdfWriter(outfn, trailer=trailer).write()
|
||||
|
|
|
@ -1,16 +1,23 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
'''
|
||||
usage: booklet.py my.pdf
|
||||
usage: booklet.py [-p] my.pdf
|
||||
|
||||
Creates booklet.my.pdf
|
||||
|
||||
Pages organized in a form suitable for booklet printing, e.g.
|
||||
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
|
||||
|
||||
The output would be using the same type of sheet
|
||||
and you can get up to 3 blank sides if -p is enabled.
|
||||
|
||||
Otherwise the two sides in the middle will be in original page size
|
||||
and you can have 1 blank sides at most.
|
||||
|
||||
'''
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
|
||||
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||
|
||||
|
@ -21,13 +28,23 @@ def fixpage(*pages):
|
|||
return result.render()
|
||||
|
||||
|
||||
inpfn, = sys.argv[1:]
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("input", help="Input pdf file name")
|
||||
parser.add_argument("-p", "--padding", action = "store_true",
|
||||
help="Padding the document so that all pages use the same type of sheet")
|
||||
args = parser.parse_args()
|
||||
|
||||
inpfn = args.input
|
||||
outfn = 'booklet.' + os.path.basename(inpfn)
|
||||
ipages = PdfReader(inpfn).pages
|
||||
|
||||
# Make sure we have an even number
|
||||
if len(ipages) & 1:
|
||||
ipages.append(None)
|
||||
if args.padding:
|
||||
pad_to = 4
|
||||
else:
|
||||
pad_to = 2
|
||||
|
||||
# Make sure we have a correct number of sides
|
||||
ipages += [None]*(-len(ipages)%pad_to)
|
||||
|
||||
opages = []
|
||||
while len(ipages) > 2:
|
||||
|
@ -36,4 +53,4 @@ while len(ipages) > 2:
|
|||
|
||||
opages += ipages
|
||||
|
||||
PdfWriter().addpages(opages).write(outfn)
|
||||
PdfWriter(outfn).addpages(opages).write()
|
||||
|
|
|
@ -22,6 +22,6 @@ outfn = 'extract.' + os.path.basename(inpfn)
|
|||
pages = list(page_per_xobj(PdfReader(inpfn).pages, margin=0.5*72))
|
||||
if not pages:
|
||||
raise IndexError("No XObjects found")
|
||||
writer = PdfWriter()
|
||||
writer = PdfWriter(outfn)
|
||||
writer.addpages(pages)
|
||||
writer.write(outfn)
|
||||
writer.write()
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
'''
|
||||
Enhanced example of watermarking using form xobjects (pdfrw).
|
||||
|
||||
usage: fancy_watermark.py [-u] my.pdf single_page.pdf
|
||||
|
||||
Creates watermark.my.pdf, with every page overlaid with
|
||||
first page from single_page.pdf. If -u is selected, watermark
|
||||
will be placed underneath page (painted first).
|
||||
|
||||
The stock watermark.py program assumes all pages are the same
|
||||
size. This example deals with pages of differing sizes in order
|
||||
to show some concepts of positioning and scaling.
|
||||
|
||||
This version applies the watermark such that the upper right
|
||||
corner of the watermark is at the upper right corner of the
|
||||
document page for odd pages, and at the upper left corner
|
||||
of the document page for even pages, for each page of the
|
||||
document.
|
||||
|
||||
It also rescales the size of the watermark if the watermark
|
||||
is too wide for the page.
|
||||
|
||||
These scaling and positioning adjustments can easily
|
||||
be customized for any particular application.
|
||||
|
||||
To handle documents with different page sizes, a cache is
|
||||
maintained of a modified intermediate watermark object
|
||||
for each page size.
|
||||
'''
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||
|
||||
# Get all the filenames
|
||||
|
||||
argv = sys.argv[1:]
|
||||
underneath = '-u' in argv
|
||||
if underneath:
|
||||
del argv[argv.index('-u')]
|
||||
inpfn, wmarkfn = argv
|
||||
outfn = 'watermark.' + os.path.basename(inpfn)
|
||||
|
||||
# Open both the source files
|
||||
wmark_trailer = PdfReader(wmarkfn)
|
||||
trailer = PdfReader(inpfn)
|
||||
|
||||
# Handle different sized pages in same document with
|
||||
# a memoization cache, so we don't create more watermark
|
||||
# objects than we need to (typically only one per document).
|
||||
|
||||
wmark_page = wmark_trailer.pages[0]
|
||||
wmark_cache = {}
|
||||
|
||||
# Process every page
|
||||
for pagenum, page in enumerate(trailer.pages, 1):
|
||||
|
||||
# Get the media box of the page, and see
|
||||
# if we have a matching watermark in the cache
|
||||
mbox = tuple(float(x) for x in page.MediaBox)
|
||||
odd = pagenum & 1
|
||||
key = mbox, odd
|
||||
wmark = wmark_cache.get(key)
|
||||
if wmark is None:
|
||||
|
||||
# Create and cache a new watermark object.
|
||||
wmark = wmark_cache[key] = PageMerge().add(wmark_page)[0]
|
||||
|
||||
# The math is more complete than it probably needs to be,
|
||||
# because the origin of all pages is almost always (0, 0).
|
||||
# Nonetheless, we illustrate all the values and their names.
|
||||
|
||||
page_x, page_y, page_x1, page_y1 = mbox
|
||||
page_w = page_x1 - page_x
|
||||
page_h = page_y1 - page_y # For illustration, not used
|
||||
|
||||
# Scale the watermark if it is too wide for the page
|
||||
# (Could do the same for height instead if needed)
|
||||
if wmark.w > page_w:
|
||||
wmark.scale(1.0 * page_w / wmark.w)
|
||||
|
||||
# Always put watermark at the top of the page
|
||||
# (but see horizontal positioning for other ideas)
|
||||
wmark.y += page_y1 - wmark.h
|
||||
|
||||
# For odd pages, put it at the left of the page,
|
||||
# and for even pages, put it on the right of the page.
|
||||
if odd:
|
||||
wmark.x = page_x
|
||||
else:
|
||||
wmark.x += page_x1 - wmark.w
|
||||
|
||||
# Optimize the case where the watermark is same width
|
||||
# as page.
|
||||
if page_w == wmark.w:
|
||||
wmark_cache[mbox, not odd] = wmark
|
||||
|
||||
# Add the watermark to the page
|
||||
PageMerge(page).add(wmark, prepend=underneath).render()
|
||||
|
||||
# Write out the destination file
|
||||
PdfWriter(outfn, trailer=trailer).write()
|
|
@ -37,7 +37,7 @@ def adjust(page, margin=36, scale=4.8):
|
|||
inpfn, = sys.argv[1:]
|
||||
outfn = 'poster.' + os.path.basename(inpfn)
|
||||
reader = PdfReader(inpfn)
|
||||
writer = PdfWriter()
|
||||
writer = PdfWriter(outfn)
|
||||
writer.addpage(adjust(reader.pages[0]))
|
||||
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
|
||||
writer.write(outfn)
|
||||
writer.write()
|
||||
|
|
|
@ -29,4 +29,4 @@ def fixpage(page, count=[0]):
|
|||
inpfn, = sys.argv[1:]
|
||||
outfn = 'print_two.' + os.path.basename(inpfn)
|
||||
pages = PdfReader(inpfn).pages
|
||||
PdfWriter().addpages(fixpage(x) for x in pages).write(outfn)
|
||||
PdfWriter(outfn).addpages(fixpage(x) for x in pages).write()
|
||||
|
|
|
@ -232,6 +232,19 @@ def parse_text_out(self, token='Tj', params='t'):
|
|||
text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
|
||||
self.tpath.textOut(text)
|
||||
|
||||
def parse_lf_text_out(self, token="'", params='t'):
|
||||
self.tpath.textLine()
|
||||
text = params[0].decode(self.curfont.remap, self.curfont.twobyte)
|
||||
self.tpath.textOut(text)
|
||||
|
||||
|
||||
def parse_lf_text_out_with_spacing(self, token='"', params='fft'):
|
||||
self.tpath.setWordSpace(params[0])
|
||||
self.tpath.setCharSpace(params[1])
|
||||
self.tpath.textLine()
|
||||
text = params[2].decode(self.curfont.remap, self.curfont.twobyte)
|
||||
self.tpath.textOut(text)
|
||||
|
||||
|
||||
def parse_TJ(self, token='TJ', params='a'):
|
||||
remap = self.curfont.remap
|
||||
|
@ -377,7 +390,7 @@ class _ParseClass(object):
|
|||
self.gpath = None
|
||||
self.tpath = None
|
||||
self.fontdict = dict((x, FontInfo(y)) for
|
||||
(x, y) in page.Resources.Font.iteritems())
|
||||
(x, y) in page.Resources.Font.items())
|
||||
|
||||
for token in self.tokens:
|
||||
info = dispatch(token)
|
||||
|
@ -424,7 +437,7 @@ def debugparser(undisturbed=set('parse_array'.split())):
|
|||
myfunc = oldval[0]
|
||||
return myfunc, oldval[1]
|
||||
return dict((x, getvalue(y))
|
||||
for (x, y) in _ParseClass.dispatch.iteritems())
|
||||
for (x, y) in _ParseClass.dispatch.items())
|
||||
|
||||
class _DebugParse(_ParseClass):
|
||||
dispatch = debugdispatch()
|
||||
|
@ -435,10 +448,10 @@ parsepage = _ParseClass.parsepage
|
|||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
from pdfreader import PdfReader
|
||||
from pdfrw import PdfReader
|
||||
parse = debugparser()
|
||||
fname, = sys.argv[1:]
|
||||
pdf = PdfReader(fname)
|
||||
pdf = PdfReader(fname, decompress=True)
|
||||
for i, page in enumerate(pdf.pages):
|
||||
print ('\nPage %s ------------------------------------' % i)
|
||||
parse(page)
|
||||
|
|
|
@ -36,6 +36,6 @@ for onerange in ranges:
|
|||
pages[pagenum].Rotate = (int(pages[pagenum].inheritable.Rotate or
|
||||
0) + rotate) % 360
|
||||
|
||||
outdata = PdfWriter()
|
||||
outdata = PdfWriter(outfn)
|
||||
outdata.trailer = trailer
|
||||
outdata.write(outfn)
|
||||
outdata.write()
|
||||
|
|
|
@ -20,10 +20,10 @@ assert ranges, "Expected at least one range"
|
|||
ranges = ([int(y) for y in x.split('-')] for x in ranges)
|
||||
outfn = 'subset.%s' % os.path.basename(inpfn)
|
||||
pages = PdfReader(inpfn).pages
|
||||
outdata = PdfWriter()
|
||||
outdata = PdfWriter(outfn)
|
||||
|
||||
for onerange in ranges:
|
||||
onerange = (onerange + onerange[-1:])[:2]
|
||||
for pagenum in range(onerange[0], onerange[1]+1):
|
||||
outdata.addpage(pages[pagenum-1])
|
||||
outdata.write(outfn)
|
||||
outdata.write()
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
'''
|
||||
usage: subset_booklets.py my.pdf
|
||||
|
||||
Creates subset_booklets.my.pdf
|
||||
|
||||
Pages organized in a form suitable for booklet printing, e.g.
|
||||
to print 4 8.5x11 pages using a single 11x17 sheet (double-sided).
|
||||
Instead of a large booklet, the pdf is divided into several mini
|
||||
booklets. The reason is: professional printing works this way:
|
||||
- Print all of several mini booklets(subsets of booklet);
|
||||
- Saw each mini booklet individually;
|
||||
- glue them all together;
|
||||
- Insert the cover.
|
||||
|
||||
Take a look at http://www.wikihow.com/Bind-a-Book
|
||||
'''
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
from pdfrw import PdfReader, PdfWriter, PageMerge
|
||||
|
||||
BOOKLET_SIZE = 20
|
||||
START = time.time()
|
||||
|
||||
def fixpage(*pages):
|
||||
result = PageMerge() + (x for x in pages if x is not None)
|
||||
result[-1].x += result[0].w
|
||||
return result.render()
|
||||
|
||||
INPFN, = sys.argv[1:]
|
||||
OUTFN = 'booklet.' + os.path.basename(INPFN)
|
||||
ALL_IPAGES = PdfReader(INPFN).pages
|
||||
print 'The pdf file '+str(INPFN)+' has '+str(len(ALL_IPAGES))+' pages.'
|
||||
|
||||
#Make sure we have an even number
|
||||
if len(ALL_IPAGES) & 1:
|
||||
ALL_IPAGES.append(None)
|
||||
print 'Inserting one more blank page to make pages number even.'
|
||||
NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)
|
||||
|
||||
print 'Making '+str(NUM_OF_ITER)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.'
|
||||
opages = []
|
||||
for iteration in range(0, NUM_OF_ITER):
|
||||
ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
|
||||
while len(ipages) > 2:
|
||||
opages.append(fixpage(ipages.pop(), ipages.pop(0)))
|
||||
opages.append(fixpage(ipages.pop(0), ipages.pop()))
|
||||
|
||||
# Making one more subbooklet with the left pages
|
||||
ipages = ALL_IPAGES[len(ALL_IPAGES)-ITERS_LEFT:len(ALL_IPAGES)]
|
||||
while len(ipages) > 2:
|
||||
opages.append(fixpage(ipages.pop(), ipages.pop(0)))
|
||||
opages.append(fixpage(ipages.pop(0), ipages.pop()))
|
||||
if len(ipages) >= 1:
|
||||
opages.append(fixpage(ipages.pop(), ipages.pop(0)))
|
||||
|
||||
PdfWriter(OUTFN).addpages(opages).write()
|
||||
print 'It took '+ str(round(time.time()-START, 2))+' seconds to make the pdf subbooklets changes.'
|
|
@ -26,7 +26,7 @@ def splitpage(src):
|
|||
|
||||
inpfn, = sys.argv[1:]
|
||||
outfn = 'unspread.' + os.path.basename(inpfn)
|
||||
writer = PdfWriter()
|
||||
writer = PdfWriter(outfn)
|
||||
for page in PdfReader(inpfn).pages:
|
||||
writer.addpages(splitpage(page))
|
||||
writer.write(outfn)
|
||||
writer.write()
|
||||
|
|
|
@ -9,10 +9,14 @@ Creates watermark.my.pdf, with every page overlaid with
|
|||
first page from single_page.pdf. If -u is selected, watermark
|
||||
will be placed underneath page (painted first).
|
||||
|
||||
NB: At one point, this example was extremely complicated, with
|
||||
multiple options. That only led to errors in implementation,
|
||||
so it has been re-simplified in order to show basic principles
|
||||
of the library operation and to match the other examples better.
|
||||
NOTE 1: This program assumes that all pages (including the watermark
|
||||
page) are the same size. For other possibilities, see
|
||||
the fancy_watermark.py example.
|
||||
|
||||
NOTE 2: At one point, this example was extremely complicated, with
|
||||
multiple options. That only led to errors in implementation,
|
||||
so it has been re-simplified in order to show basic principles
|
||||
of the library operation and to match the other examples better.
|
||||
'''
|
||||
|
||||
import sys
|
||||
|
@ -30,4 +34,4 @@ wmark = PageMerge().add(PdfReader(wmarkfn).pages[0])[0]
|
|||
trailer = PdfReader(inpfn)
|
||||
for page in trailer.pages:
|
||||
PageMerge(page).add(wmark, prepend=underneath).render()
|
||||
PdfWriter().write(outfn, trailer)
|
||||
PdfWriter(outfn, trailer=trailer).write()
|
||||
|
|
|
@ -10,13 +10,14 @@ from .tokens import PdfTokens
|
|||
from .errors import PdfParseError
|
||||
from .pagemerge import PageMerge
|
||||
|
||||
__version__ = '0.2'
|
||||
__version__ = '0.4'
|
||||
|
||||
# Add a tiny bit of compatibility to pyPdf
|
||||
|
||||
PdfFileReader = PdfReader
|
||||
PdfFileWriter = PdfWriter
|
||||
|
||||
__all__ = [PdfWriter, PdfReader, PdfObject, PdfName, PdfArray,
|
||||
PdfTokens, PdfParseError, PdfDict, IndirectPdfDict,
|
||||
PdfString, PageMerge]
|
||||
__all__ = """PdfWriter PdfReader PdfObject PdfName PdfArray
|
||||
PdfTokens PdfParseError PdfDict IndirectPdfDict
|
||||
PdfString PageMerge""".split()
|
||||
|
||||
|
|
|
@ -32,6 +32,8 @@ from .objects import PdfDict, PdfArray, PdfName
|
|||
from .pdfreader import PdfReader
|
||||
from .errors import log, PdfNotImplementedError
|
||||
from .py23_diffs import iteritems
|
||||
from .uncompress import uncompress
|
||||
from .compress import compress
|
||||
|
||||
|
||||
class ViewInfo(object):
|
||||
|
@ -169,6 +171,10 @@ def _build_cache(contents, allow_compressed):
|
|||
and save it along with private cache info.
|
||||
Assumes validity has been pre-checked if
|
||||
we have a non-None xobj_copy.
|
||||
|
||||
Also, the spec says nothing about nested arrays,
|
||||
so we assume those don't exist until we see one
|
||||
in the wild.
|
||||
'''
|
||||
try:
|
||||
xobj_copy = contents.xobj_copy
|
||||
|
@ -183,9 +189,20 @@ def _build_cache(contents, allow_compressed):
|
|||
array = [contents]
|
||||
private = contents.private
|
||||
|
||||
# The spec says nothing about nested arrays. Will
|
||||
# assume that's not a problem until we encounter them...
|
||||
# If we don't allow compressed objects, OR if we have multiple compressed
|
||||
# objects, we try to decompress them, and fail if we cannot do that.
|
||||
|
||||
if not allow_compressed or len(array) > 1:
|
||||
keys = set(x[0] for cdict in array for x in iteritems(cdict))
|
||||
was_compressed = len(keys) > 1
|
||||
if was_compressed:
|
||||
# Make copies of the objects before we uncompress them.
|
||||
array = [PdfDict(x) for x in array]
|
||||
if not uncompress(array):
|
||||
raise PdfNotImplementedError(
|
||||
'Xobjects with these compression parameters not supported: %s' %
|
||||
keys)
|
||||
|
||||
xobj_copy = PdfDict(array[0])
|
||||
xobj_copy.private.xobj_cachedict = {}
|
||||
private.xobj_copy = xobj_copy
|
||||
|
@ -195,19 +212,9 @@ def _build_cache(contents, allow_compressed):
|
|||
newlength = sum(int(x.Length) for x in array) + len(array) - 1
|
||||
assert newlength == len(newstream)
|
||||
xobj_copy.stream = newstream
|
||||
if was_compressed and allow_compressed:
|
||||
compress(xobj_copy)
|
||||
|
||||
# Cannot currently cope with different kinds of
|
||||
# compression in the array, so just disallow it.
|
||||
allow_compressed = False
|
||||
|
||||
if not allow_compressed:
|
||||
# Make sure there are no compression parameters
|
||||
for cdict in array:
|
||||
keys = [x[0] for x in iteritems(cdict)]
|
||||
if len(keys) != 1:
|
||||
raise PdfNotImplementedError(
|
||||
'Xobjects with compression parameters not supported: %s' %
|
||||
keys)
|
||||
return xobj_copy
|
||||
|
||||
|
||||
|
|
|
@ -3,14 +3,14 @@
|
|||
# MIT license -- See LICENSE.txt for details
|
||||
|
||||
'''
|
||||
Currently, this sad little file only knows how to decompress
|
||||
Currently, this sad little file only knows how to compress
|
||||
using the flate (zlib) algorithm. Maybe more later, but it's
|
||||
not a priority for me...
|
||||
'''
|
||||
|
||||
from .objects import PdfName
|
||||
from .uncompress import streamobjects
|
||||
from .py23_diffs import zlib
|
||||
from .py23_diffs import zlib, convert_load, convert_store
|
||||
|
||||
|
||||
def compress(mylist):
|
||||
|
@ -20,7 +20,7 @@ def compress(mylist):
|
|||
if ftype is not None:
|
||||
continue
|
||||
oldstr = obj.stream
|
||||
newstr = zlib.compress(oldstr)
|
||||
newstr = convert_load(zlib.compress(convert_store(oldstr)))
|
||||
if len(newstr) < len(oldstr) + 30:
|
||||
obj.stream = newstr
|
||||
obj.Filter = flate
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||
# Copyright (C) 2017 Jon Lund Steffensen
|
||||
# MIT license -- See LICENSE.txt for details
|
||||
|
||||
from __future__ import division
|
||||
|
||||
import hashlib
|
||||
import struct
|
||||
|
||||
try:
|
||||
from Crypto.Cipher import ARC4, AES
|
||||
HAS_CRYPTO = True
|
||||
except ImportError:
|
||||
HAS_CRYPTO = False
|
||||
|
||||
from .objects import PdfDict, PdfName
|
||||
|
||||
_PASSWORD_PAD = (
|
||||
'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
||||
'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
||||
|
||||
|
||||
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
|
||||
for obj in mylist:
|
||||
if isinstance(obj, PdfDict) and obj.stream is not None:
|
||||
yield obj
|
||||
|
||||
|
||||
def create_key(password, doc):
|
||||
"""Create an encryption key (Algorithm 2 in PDF spec)."""
|
||||
key_size = int(doc.Encrypt.Length or 40) // 8
|
||||
padded_pass = (password + _PASSWORD_PAD)[:32]
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(padded_pass)
|
||||
hasher.update(doc.Encrypt.O.to_bytes())
|
||||
hasher.update(struct.pack('<i', int(doc.Encrypt.P)))
|
||||
hasher.update(doc.ID[0].to_bytes())
|
||||
temp_hash = hasher.digest()
|
||||
|
||||
if int(doc.Encrypt.R or 0) >= 3:
|
||||
for _ in range(50):
|
||||
temp_hash = hashlib.md5(temp_hash[:key_size]).digest()
|
||||
|
||||
return temp_hash[:key_size]
|
||||
|
||||
|
||||
def create_user_hash(key, doc):
|
||||
"""Create the user password hash (Algorithm 4/5)."""
|
||||
revision = int(doc.Encrypt.R or 0)
|
||||
if revision < 3:
|
||||
cipher = ARC4.new(key)
|
||||
return cipher.encrypt(_PASSWORD_PAD)
|
||||
else:
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(_PASSWORD_PAD)
|
||||
hasher.update(doc.ID[0].to_bytes())
|
||||
temp_hash = hasher.digest()
|
||||
|
||||
for i in range(20):
|
||||
temp_key = ''.join(chr(i ^ ord(x)) for x in key)
|
||||
cipher = ARC4.new(temp_key)
|
||||
temp_hash = cipher.encrypt(temp_hash)
|
||||
|
||||
return temp_hash
|
||||
|
||||
|
||||
def check_user_password(key, doc):
|
||||
"""Check that the user password is correct (Algorithm 6)."""
|
||||
expect_user_hash = create_user_hash(key, doc)
|
||||
revision = int(doc.Encrypt.R or 0)
|
||||
if revision < 3:
|
||||
return doc.Encrypt.U.to_bytes() == expect_user_hash
|
||||
else:
|
||||
return doc.Encrypt.U.to_bytes()[:16] == expect_user_hash
|
||||
|
||||
|
||||
class AESCryptFilter(object):
|
||||
"""Crypt filter corresponding to /AESV2."""
|
||||
def __init__(self, key):
|
||||
self._key = key
|
||||
|
||||
def decrypt_data(self, num, gen, data):
|
||||
"""Decrypt data (string/stream) using key (Algorithm 1)."""
|
||||
key_extension = struct.pack('<i', num)[:3]
|
||||
key_extension += struct.pack('<i', gen)[:2]
|
||||
key_extension += 'sAlT'
|
||||
temp_key = self._key + key_extension
|
||||
temp_key = hashlib.md5(temp_key).digest()
|
||||
|
||||
iv = data[:AES.block_size]
|
||||
cipher = AES.new(temp_key, AES.MODE_CBC, iv)
|
||||
decrypted = cipher.decrypt(data[AES.block_size:])
|
||||
|
||||
# Remove padding
|
||||
pad_size = ord(decrypted[-1])
|
||||
assert 1 <= pad_size <= 16
|
||||
return decrypted[:-pad_size]
|
||||
|
||||
|
||||
class RC4CryptFilter(object):
    """Crypt filter corresponding to /V2 (RC4)."""

    def __init__(self, key):
        # File encryption key derived from the document password.
        self._key = key

    def decrypt_data(self, num, gen, data):
        """Decrypt data (string/stream) using key (Algorithm 1)."""
        # Per-object key: MD5 of the file key extended with the low 3
        # bytes of the object number and low 2 bytes of the generation
        # number, truncated to at most 16 bytes.
        extension = struct.pack('<i', num)[:3] + struct.pack('<i', gen)[:2]
        digest = hashlib.md5(self._key + extension).digest()
        object_key = digest[:min(len(self._key) + 5, 16)]
        return ARC4.new(object_key).decrypt(data)
|
||||
|
||||
|
||||
class IdentityCryptFilter(object):
    """Identity crypt filter (pass through with no encryption)."""

    def decrypt_data(self, num, gen, data):
        # num and gen are accepted only for interface compatibility with
        # the real crypt filters; the payload is returned untouched.
        return data
|
||||
|
||||
|
||||
def decrypt_objects(objects, default_filter, filters):
    """Decrypt list of stream objects.

    The parameter default_filter specifies the default filter to use. The
    filters parameter is a dictionary of alternate filters to use when the
    object specifies an alternate filter locally.
    """
    for obj in streamobjects(objects):
        # An object may appear in more than one pass; never decrypt twice.
        if getattr(obj, 'decrypted', False):
            continue

        active_filter = default_filter

        # A locally defined crypt filter (a leading /Crypt entry in the
        # object's /Filter chain) overrides the default filter.
        ftype = obj.Filter
        if ftype is not None:
            if not isinstance(ftype, list):
                ftype = [ftype]
            if ftype and ftype[0] == PdfName.Crypt:
                ftype = ftype[1:]
                parms = obj.DecodeParms or obj.DP
                active_filter = filters[parms.Name]

        num, gen = obj.indirect
        obj.stream = active_filter.decrypt_data(num, gen, obj.stream)
        obj.private.decrypted = True
        # Any consumed /Crypt entry has been stripped from the chain.
        obj.Filter = ftype or None
|
|
@ -9,11 +9,14 @@ PDF Exceptions and error handling
|
|||
import logging
|
||||
|
||||
|
||||
# Configure a dedicated 'pdfrw' logger with its own handler and format.
# Deliberately avoid logging.basicConfig(): that configures the *root*
# logger and would clobber the logging setup of any application that
# embeds this library (and, combined with a local handler, would emit
# every record twice).
fmt = logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)d %(message)s')

handler = logging.StreamHandler()
handler.setFormatter(fmt)

log = logging.getLogger('pdfrw')
log.setLevel(logging.WARNING)
log.addHandler(handler)
|
||||
|
||||
|
||||
class PdfError(Exception):
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
'''
|
||||
|
||||
from .objects import PdfDict, PdfArray, PdfName
|
||||
from .pdfwriter import user_fmt
|
||||
|
||||
|
||||
def find_objects(source, valid_types=(PdfName.XObject, None),
|
||||
|
@ -81,7 +80,7 @@ def wrap_object(obj, width, margin):
|
|||
iw, ih = float(obj.Width), float(obj.Height)
|
||||
ch = 1.0 * cw / iw * ih
|
||||
height = ch + margin[1] + margin[3]
|
||||
p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset))
|
||||
p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset))
|
||||
contents.stream = fmt % p
|
||||
resources = PdfDict(XObject=PdfDict(MyImage=obj))
|
||||
mbox = PdfArray((0, 0, width, height))
|
||||
|
|
|
@ -15,5 +15,5 @@ from .pdfobject import PdfObject
|
|||
from .pdfstring import PdfString
|
||||
from .pdfindirect import PdfIndirect
|
||||
|
||||
# Declare the public API as *strings*.  The earlier form listed the
# class objects themselves, which is not a valid __all__ and breaks
# "from pdfrw.objects import *" consumers.
__all__ = """PdfName PdfDict IndirectPdfDict PdfArray
          PdfObject PdfString PdfIndirect""".split()
|
||||
|
|
|
@ -65,3 +65,7 @@ class PdfArray(list):
|
|||
def pop(self, *args):
    """Remove and return an item, first resolving any indirect objects
    held in the array so the returned value is a real object.
    """
    self._resolve()
    return list.pop(self, *args)
|
||||
|
||||
def __reversed__(self):
    """Iterate the array in reverse order, first resolving any indirect
    objects so callers see real objects.
    """
    self._resolve()
    return list.__reversed__(self)
|
||||
|
|
|
@ -136,7 +136,15 @@ class PdfDict(dict):
|
|||
'''
|
||||
value = dictget(self, key)
|
||||
if isinstance(value, PdfIndirect):
|
||||
self[key] = value = value.real_value()
|
||||
# We used to use self[key] here, but that does an
|
||||
# unwanted check on the type of the key (github issue #98).
|
||||
# Python will keep the old key object in the dictionary,
|
||||
# so that check is not necessary.
|
||||
value = value.real_value()
|
||||
if value is not None:
|
||||
dict.__setitem__(self, key, value)
|
||||
else:
|
||||
del self[name]
|
||||
return value
|
||||
|
||||
def __getitem__(self, key):
|
||||
|
|
|
@ -23,6 +23,7 @@ class BasePdfName(str):
|
|||
'''
|
||||
|
||||
indirect = False
|
||||
encoded = None
|
||||
|
||||
whitespace = '\x00 \t\f\r\n'
|
||||
delimiters = '()<>{}[]/%'
|
||||
|
|
|
@ -1,74 +1,553 @@
|
|||
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
|
||||
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||
# 2016 James Laird-Wah, Sydney, Australia
|
||||
# MIT license -- See LICENSE.txt for details
|
||||
|
||||
import re
|
||||
"""
|
||||
|
||||
================================
|
||||
PdfString encoding and decoding
|
||||
================================
|
||||
|
||||
Introduction
|
||||
=============
|
||||
|
||||
|
||||
This module handles encoding and decoding of PDF strings. PDF strings
|
||||
are described in the PDF 1.7 reference manual, mostly in chapter 3
|
||||
(sections 3.2 and 3.8) and chapter 5.
|
||||
|
||||
PDF strings are used in the document structure itself, and also inside
|
||||
the stream of page contents dictionaries.
|
||||
|
||||
A PDF string can represent pure binary data (e.g. for a font or an
|
||||
image), or text, or glyph indices. For Western fonts, the glyph indices
|
||||
usually correspond to ASCII, but that is not guaranteed. (When it does
|
||||
happen, it makes examination of raw PDF data a lot easier.)
|
||||
|
||||
The specification defines PDF string encoding at two different levels.
|
||||
At the bottom, it defines ways to encode arbitrary bytes so that a PDF
|
||||
tokenizer can understand they are a string of some sort, and can figure
|
||||
out where the string begins and ends. (That is all the tokenizer itself
|
||||
cares about.) Above that level, if the string represents text, the
|
||||
specification defines ways to encode Unicode text into raw bytes, before
|
||||
the byte encoding is performed.
|
||||
|
||||
There are two ways to do the byte encoding, and two ways to do the text
|
||||
(Unicode) encoding.
|
||||
|
||||
Encoding bytes into PDF strings
|
||||
================================
|
||||
|
||||
Adobe calls the two ways to encode bytes into strings "Literal strings"
|
||||
and "Hexadecimal strings."
|
||||
|
||||
Literal strings
|
||||
------------------
|
||||
|
||||
A literal string is delimited by ASCII parentheses ("(" and ")"), and a
|
||||
hexadecimal string is delimited by ASCII less-than and greater-than
|
||||
signs ("<" and ">").
|
||||
|
||||
A literal string may encode bytes almost unmolested. The caveat is
|
||||
that if a byte has the same value as a parenthesis, it must be escaped
|
||||
so that the tokenizer knows the string is not finished. This is accomplished
|
||||
by using the ASCII backslash ("\") as an escape character. Of course,
|
||||
now any backslash appearing in the data must likewise be escaped.
|
||||
|
||||
Hexadecimal strings
|
||||
---------------------
|
||||
|
||||
A hexadecimal string requires twice as much space as the source data
|
||||
it represents (plus two bytes for the delimiter), simply storing each
|
||||
byte as two hexadecimal digits, most significant digit first. The spec
|
||||
allows for lower or upper case hex digits, but most PDF encoders seem
|
||||
to use upper case.
|
||||
|
||||
Special cases -- Legacy systems and readability
|
||||
-----------------------------------------------
|
||||
|
||||
It is possible to create a PDF document that uses 7 bit ASCII encoding,
|
||||
and it is desirable in many cases to create PDFs that are reasonably
|
||||
readable when opened in a text editor. For these reasons, the syntax
|
||||
for both literal strings and hexadecimal strings is slightly more
|
||||
complicated than the initial description above. In general, the additional
|
||||
syntax allows the following features:
|
||||
|
||||
- Making the delineation between characters, or between sections of
|
||||
a string, apparent, and easy to see in an editor.
|
||||
- Keeping output lines from getting too wide for some editors
|
||||
- Keeping output lines from being so narrow that you can only see the
|
||||
small fraction of a string at a time in an editor.
|
||||
- Suppressing unprintable characters
|
||||
- Restricting the output string to 7 bit ASCII
|
||||
|
||||
Hexadecimal readability
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For hexadecimal strings, only the first two bullets are relevant. The syntax
|
||||
to accomplish this is simple, allowing any ASCII whitespace to be inserted
|
||||
anywhere in the encoded hex string.
|
||||
|
||||
Literal readability
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For literal strings, all of the bullets except the first are relevant.
|
||||
The syntax has two methods to help with these goals. The first method
|
||||
is to overload the escape operator to be able to do different functions,
|
||||
and the second method can reduce the number of escapes required for
|
||||
parentheses in the normal case.
|
||||
|
||||
The escape function works differently, depending on what byte follows
|
||||
the backslash. In all cases, the escaping backslash is discarded,
|
||||
and then the next character is examined:
|
||||
|
||||
- For parentheses and backslashes (and, in fact, for all characters
|
||||
not described otherwise in this list), the character after the
|
||||
backslash is preserved in the output.
|
||||
- A letter from the set of "nrtbf" following a backslash is interpreted as
|
||||
a line feed, carriage return, tab, backspace, or form-feed, respectively.
|
||||
- One to three octal digits following the backslash indicate the
|
||||
numeric value of the encoded byte.
|
||||
- A carriage return, carriage return/line feed, or line feed following
|
||||
the backslash indicates a line break that was put in for readability,
|
||||
and that is not part of the actual data, so this is discarded.
|
||||
|
||||
The second method that can be used to improve readability (and reduce space)
|
||||
in literal strings is to not escape parentheses. This only works, and is
|
||||
only allowed, when the parentheses are properly balanced. For example,
|
||||
"((Hello))" is a valid encoding for a literal string, but "((Hello)" is not;
|
||||
the latter case should be encoded "(\(Hello)"
|
||||
|
||||
Encoding text into strings
|
||||
==========================
|
||||
|
||||
Section 3.8.1 of the PDF specification describes text strings.
|
||||
|
||||
The individual characters of a text string can all be considered to
|
||||
be Unicode; Adobe specifies two different ways to encode these characters
|
||||
into a string of bytes before further encoding the byte string as a
|
||||
literal string or a hexadecimal string.
|
||||
|
||||
The first way to encode these strings is called PDFDocEncoding. This
|
||||
is mostly a one-for-one mapping of bytes into single bytes, similar to
|
||||
Latin-1. The representable character set is limited to the number of
|
||||
characters that can fit in a byte, and this encoding cannot be used
|
||||
with Unicode strings that start with the two characters making up the
|
||||
UTF-16-BE BOM.
|
||||
|
||||
The second way to encode these strings is with UTF-16-BE. Text strings
|
||||
encoded with this method must start with the BOM, and although the spec
|
||||
does not appear to mandate that the resultant bytes be encoded into a
|
||||
hexadecimal string, that seems to be the canonical way to do it.
|
||||
|
||||
When encoding a string into UTF-16-BE, this module always adds the BOM,
|
||||
and when decoding a string from UTF-16-BE, this module always strips
|
||||
the BOM. If a source string contains a BOM, that will remain in the
|
||||
final string after a round-trip through the encoder and decoder, as
|
||||
the goal of the encoding/decoding process is transparency.
|
||||
|
||||
|
||||
PDF string handling in pdfrw
|
||||
=============================
|
||||
|
||||
Responsibility for handling PDF strings in the pdfrw library is shared
|
||||
between this module, the tokenizer, and the pdfwriter.
|
||||
|
||||
tokenizer string handling
|
||||
--------------------------
|
||||
|
||||
As far as the tokenizer and its clients such as the pdfreader are concerned,
|
||||
the PdfString class must simply be something that it can instantiate by
|
||||
passing a string, that doesn't compare equal (or throw an exception when
|
||||
compared) to other possible token strings. The tokenizer must understand
|
||||
enough about the syntax of the string to successfully find its beginning
|
||||
and end in a stream of tokens, but doesn't otherwise know or care about
|
||||
the data represented by the string.
|
||||
|
||||
pdfwriter string handling
|
||||
--------------------------
|
||||
|
||||
The pdfwriter knows and cares about two attributes of PdfString instances:
|
||||
|
||||
- First, PdfString objects have an 'indirect' attribute, which pdfwriter
|
||||
uses as an indication that the object knows how to represent itself
|
||||
correctly when output to a new PDF. (In the case of a PdfString object,
|
||||
no work is really required, because it is already a string.)
|
||||
- Second, the PdfString.encode() method is used as a convenience to
|
||||
automatically convert any user-supplied strings (that didn't come
|
||||
from PDFs) when a PDF is written out to a file.
|
||||
|
||||
pdfstring handling
|
||||
-------------------
|
||||
|
||||
The code in this module is designed to support those uses by the
|
||||
tokenizer and the pdfwriter, and to additionally support encoding
|
||||
and decoding of PdfString objects as a convenience for the user.
|
||||
|
||||
Most users of the pdfrw library never encode or decode a PdfString,
|
||||
so it is imperative that (a) merely importing this module does not
|
||||
take a significant amount of CPU time; and (b) it is cheap for the
|
||||
tokenizer to produce a PdfString, and cheap for the pdfwriter to
|
||||
consume a PdfString -- if the tokenizer finds a string that conforms
|
||||
to the PDF specification, it will be wrapped in a PdfString object,
|
||||
and if the pdfwriter finds an object with an indirect attribute, it
|
||||
simply calls str() to ask it to format itself.
|
||||
|
||||
Encoding and decoding are not actually performed very often at all,
|
||||
compared to how often tokenization and then subsequent concatenation
|
||||
by the pdfwriter are performed. In fact, versions of pdfrw prior to
|
||||
0.4 did not even support Unicode for this function. Encoding and
|
||||
decoding can also easily be performed by the user, outside of the
|
||||
library, and this might still be recommended, at least for encoding,
|
||||
if the visual appeal of encodings generated by this module is found
|
||||
lacking.
|
||||
|
||||
|
||||
Decoding strings
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Decoding strings can be tricky, but is a bounded process. Each
|
||||
properly-encoded string represents exactly one output string,
|
||||
with the caveat that is up to the caller of the function to know whether
|
||||
he expects a Unicode string, or just bytes.
|
||||
|
||||
The caller can call PdfString.to_bytes() to get a byte string (which may
|
||||
or may not represent encoded Unicode), or may call PdfString.to_unicode()
|
||||
to get a Unicode string. Byte strings will be regular strings in Python 2,
|
||||
and b'' bytes in Python 3; Unicode strings will be regular strings in
|
||||
Python 3, and u'' unicode strings in Python 2.
|
||||
|
||||
To maintain application compatibility with earlier versions of pdfrw,
|
||||
PdfString.decode() is an alias for PdfString.to_unicode().
|
||||
|
||||
Encoding strings
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
PdfString has three factory functions that will encode strings into
|
||||
PdfString objects:
|
||||
|
||||
- PdfString.from_bytes() accepts a byte string (regular string in Python 2
|
||||
or b'' bytes string in Python 3) and returns a PdfString object.
|
||||
- PdfString.from_unicode() accepts a Unicode string (u'' Unicode string in
|
||||
Python 2 or regular string in Python 3) and returns a PdfString object.
|
||||
- PdfString.encode() examines the type of object passed, and either
|
||||
calls from_bytes() or from_unicode() to do the real work.
|
||||
|
||||
Unlike decoding(), encoding is not (mathematically) a function.
|
||||
There are (literally) an infinite number of ways to encode any given
|
||||
source string. (Of course, most of them would be stupid, unless
|
||||
the intent is some sort of denial-of-service attack.)
|
||||
|
||||
So encoding strings is either simpler than decoding, or can be made to
|
||||
be an open-ended science fair project (to create the best looking
|
||||
encoded strings).
|
||||
|
||||
There are parameters to the encoding functions that allow control over
|
||||
the final encoded string, but the intention is to make the default values
|
||||
produce a reasonable encoding.
|
||||
|
||||
As mentioned previously, if encoding does not do what a particular
|
||||
user needs, that user is free to write his own encoder, and then
|
||||
simply instantiate a PdfString object by passing a string to the
|
||||
default constructor, the same way that the tokenizer does it.
|
||||
|
||||
However, if desirable, encoding may gradually become more capable
|
||||
over time, adding the ability to generate more aesthetically pleasing
|
||||
encoded strings.
|
||||
|
||||
PDFDocString encoding and decoding
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To handle this encoding in a fairly standard way, this module registers
|
||||
an encoder and decoder for PDFDocEncoding with the codecs module.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import codecs
|
||||
import binascii
|
||||
import itertools
|
||||
from ..py23_diffs import convert_load, convert_store
|
||||
|
||||
def find_pdfdocencoding(encoding):
    """ Codec search function for 'pdfdocencoding' (PDF 1.7 reference
        manual, section D.2).  Conforms to the codec module registration
        protocol, and defers building the translation tables until a
        pdfdocencoding encode or decode is actually requested.
    """
    if encoding != 'pdfdocencoding':
        return

    # Bytes that map 1:1 to the same Unicode code point: printable ASCII,
    # tab/LF/CR, and the upper Latin-1 range -- except the soft hyphen at
    # 0xAD, which PDFDocEncoding does not define.
    identity = set(range(0x20, 0x7F)) | set(range(0xA1, 0x100))
    identity |= {0x09, 0x0A, 0x0D}
    identity.discard(0xAD)
    decoding_map = {code: code for code in identity}

    # Bytes that map to special (non-Latin-1) Unicode code points.
    specials = {0xA0: 0x20AC}
    specials.update(zip(range(0x18, 0x20), (
        0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC)))
    specials.update(zip(range(0x80, 0x9F), (
        0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
        0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018,
        0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160,
        0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E)))
    decoding_map.update(specials)

    # The encoding map is the strict inverse of the spec-defined decoding
    # map, built *before* any leniency is added below.
    encoding_map = codecs.make_encoding_map(decoding_map)

    # Not every PDF producer follows the spec, so conform to Postel's law
    # and accept raw control bytes (nulls, form-feeds, ...) when decoding,
    # while never generating them when encoding.
    decoding_map.update((code, code) for code in range(0x18))

    def encode(input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_map)

    def decode(input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_map)

    return codecs.CodecInfo(encode, decode, name='pdfdocencoding')
|
||||
|
||||
# Register the lazy PDFDocEncoding search function with the codecs
# machinery so 'pdfdocencoding' works with str.encode()/bytes.decode().
codecs.register(find_pdfdocencoding)
|
||||
|
||||
class PdfString(str):
    """ A PdfString is an encoded string.  It has a decode
        method to get the actual string data out, and there
        is an encode class method to create such a string.
        Like any PDF object, it could be indirect, but it
        defaults to being a direct object.
    """
    indirect = False

    # The byte order mark, and unicode that could be
    # wrongly encoded into the byte order mark by the
    # pdfdocencoding codec.
    bytes_bom = codecs.BOM_UTF16_BE
    bad_pdfdoc_prefix = bytes_bom.decode('latin-1')

    # Used by decode_literal; filled in on first use
    unescape_dict = None
    unescape_func = None

    @classmethod
    def init_unescapes(cls):
        """ Sets up the unescape attributes for decode_literal
        """
        unescape_pattern = r'\\([0-7]{1,3}|\r\n|.)'
        unescape_func = re.compile(unescape_pattern, re.DOTALL).split
        cls.unescape_func = unescape_func

        # Start with identity for every byte value, then layer on the
        # special escapes defined by the spec.
        unescape_dict = dict(((chr(x), chr(x)) for x in range(0x100)))
        unescape_dict.update(zip('nrtbf', '\n\r\t\b\f'))
        # A backslash before a line ending is a continuation: both vanish.
        unescape_dict['\r'] = ''
        unescape_dict['\n'] = ''
        unescape_dict['\r\n'] = ''
        # Octal escapes may be 1, 2, or 3 digits long.
        for i in range(0o10):
            unescape_dict['%01o' % i] = chr(i)
        for i in range(0o100):
            unescape_dict['%02o' % i] = chr(i)
        for i in range(0o400):
            unescape_dict['%03o' % i] = chr(i)
        cls.unescape_dict = unescape_dict
        return unescape_func

    def decode_literal(self):
        """ Decode a PDF literal string, which is enclosed in parentheses ()

            Many pdfrw users never decode strings, so defer creating
            data structures to do so until the first string is decoded.

            Possible string escapes from the spec:
            (PDF 1.7 Reference, section 3.2.3, page 53)

                1. \\[nrtbf\\()]: simple escapes
                2. \\d{1,3}: octal.  Must be zero-padded to 3 digits
                   if followed by digit
                3. \\<end of line>: line continuation.  We don't know
                   the EOL marker used in the PDF, so accept \\r, \\n,
                   and \\r\\n.
                4. Any other character following \\ escape -- the
                   backslash is swallowed.
        """
        result = (self.unescape_func or self.init_unescapes())(self[1:-1])
        if len(result) == 1:
            return convert_store(result[0])
        unescape_dict = self.unescape_dict
        # Odd-numbered split results are the captured escape payloads.
        result[1::2] = [unescape_dict[x] for x in result[1::2]]
        return convert_store(''.join(result))

    def decode_hex(self):
        """ Decode a PDF hexadecimal-encoded string, which is enclosed
            in angle brackets <>.
        """
        hexstr = convert_store(''.join(self[1:-1].split()))
        if len(hexstr) % 2:  # odd number of chars indicates a truncated 0
            hexstr += b'0'
        return binascii.unhexlify(hexstr)

    def to_bytes(self):
        """ Decode a PDF string to bytes.  This is a convenience function
            for user code, in that (as of pdfrw 0.3) it is never
            actually used inside pdfrw.
        """
        if self.startswith('(') and self.endswith(')'):
            return self.decode_literal()

        elif self.startswith('<') and self.endswith('>'):
            return self.decode_hex()

        else:
            raise ValueError('Invalid PDF string "%s"' % repr(self))

    def to_unicode(self):
        """ Decode a PDF string to a unicode string.  This is a
            convenience function for user code, in that (as of
            pdfrw 0.3) it is never actually used inside pdfrw.

            There are two Unicode storage methods used -- either
            UTF16_BE, or something called PDFDocEncoding, which
            is defined in the PDF spec.  The determination of
            which decoding method to use is done by examining the
            first two bytes for the byte order marker.
        """
        raw = self.to_bytes()
        if raw[:2] == self.bytes_bom:
            return raw[2:].decode('utf-16-be')
        else:
            return raw.decode('pdfdocencoding')

    # Legacy-compatible interface
    decode = to_unicode

    # Internal value used by encoding
    escape_splitter = None  # Calculated on first use

    @classmethod
    def init_escapes(cls):
        """ Initialize the escape_splitter for the from_bytes method
        """
        cls.escape_splitter = re.compile(br'(\(|\\|\))').split
        return cls.escape_splitter

    @classmethod
    def from_bytes(cls, raw, bytes_encoding='auto'):
        """ The from_bytes() constructor is called to encode a source raw
            byte string into a PdfString that is suitable for inclusion
            in a PDF.

            from_bytes() can use a heuristic to figure out the best
            encoding for the string, or the user can control the process
            by changing the bytes_encoding parameter to 'literal' or 'hex'
            to force a particular conversion method.

            NOTE: There is no magic in the encoding process.  A user
            can do his own encoding and simply initialize a PdfString()
            instance with the encoded string.
        """
        # If hexadecimal is not being forced, then figure out how long
        # the escaped literal string will be, and fall back to hex if
        # it is too long.
        force_hex = bytes_encoding == 'hex'
        if not force_hex:
            if bytes_encoding not in ('literal', 'auto'):
                raise ValueError('Invalid bytes_encoding value: %s'
                                 % bytes_encoding)
            splitlist = (cls.escape_splitter or cls.init_escapes())(raw)
            if bytes_encoding == 'auto' and len(splitlist) // 2 >= len(raw):
                force_hex = True

        if force_hex:
            # The spec does not mandate uppercase,
            # but it seems to be the convention.
            fmt = '<%s>'
            result = binascii.hexlify(raw).upper()
        else:
            fmt = '(%s)'
            splitlist[1::2] = [(b'\\' + x) for x in splitlist[1::2]]
            result = b''.join(splitlist)

        return cls(fmt % convert_load(result))

    @classmethod
    def from_unicode(cls, source, text_encoding='auto',
                     bytes_encoding='auto'):
        """ The from_unicode() constructor is called to encode a source
            string into a PdfString that is suitable for inclusion in a
            PDF.

            from_unicode() can use a heuristic to figure out the best
            encoding for the string, or the user can control the process
            by changing the text_encoding parameter to 'pdfdocencoding'
            or 'utf16', and/or by changing the bytes_encoding parameter
            to 'literal' or 'hex' to force particular conversion methods.

            The function will raise an exception if it cannot perform
            the conversion as requested by the user.
        """
        # Give preference to pdfdocencoding, since it only
        # requires one raw byte per character, rather than two.
        if text_encoding != 'utf16':
            force_pdfdoc = text_encoding == 'pdfdocencoding'
            if text_encoding != 'auto' and not force_pdfdoc:
                raise ValueError('Invalid text_encoding value: %s'
                                 % text_encoding)

            if source.startswith(cls.bad_pdfdoc_prefix):
                if force_pdfdoc:
                    raise UnicodeError('Prefix of string %r cannot be encoded '
                                       'in pdfdocencoding' % source[:20])
            else:
                try:
                    raw = source.encode('pdfdocencoding')
                except UnicodeError:
                    if force_pdfdoc:
                        raise
                else:
                    return cls.from_bytes(raw, bytes_encoding)

        # If the user is not forcing literal strings,
        # it makes much more sense to use hexadecimal with 2-byte chars
        raw = cls.bytes_bom + source.encode('utf-16-be')
        encoding = 'hex' if bytes_encoding == 'auto' else bytes_encoding
        return cls.from_bytes(raw, encoding)

    @classmethod
    def encode(cls, source, uni_type=type(u''), isinstance=isinstance):
        """ The encode() constructor is a legacy function that is
            also a convenience for the PdfWriter.
        """
        if isinstance(source, uni_type):
            return cls.from_unicode(source)
        else:
            return cls.from_bytes(source)
|
||||
|
|
|
@ -176,8 +176,8 @@ class PageMerge(list):
|
|||
return self
|
||||
|
||||
def render(self):
|
||||
def do_xobjs(xobj_list):
|
||||
content = []
|
||||
def do_xobjs(xobj_list, restore_first=False):
|
||||
content = ['Q'] if restore_first else []
|
||||
for obj in xobj_list:
|
||||
index = PdfName('pdfrw_%d' % (key_offset + len(xobjs)))
|
||||
if xobjs.setdefault(index, obj) is not obj:
|
||||
|
@ -199,9 +199,9 @@ class PageMerge(list):
|
|||
allkeys = xobjs.keys()
|
||||
if allkeys:
|
||||
keys = (x for x in allkeys if x.startswith('/pdfrw_'))
|
||||
keys = (x for x in keys if x[6:].isdigit())
|
||||
keys = sorted(keys, key=lambda x: int(x[6:]))
|
||||
key_offset = (int(keys[-1][6:]) + 1) if keys else 0
|
||||
keys = (x for x in keys if x[7:].isdigit())
|
||||
keys = sorted(keys, key=lambda x: int(x[7:]))
|
||||
key_offset = (int(keys[-1][7:]) + 1) if keys else 0
|
||||
key_offset -= len(allkeys)
|
||||
|
||||
if old_contents is None:
|
||||
|
@ -213,10 +213,18 @@ class PageMerge(list):
|
|||
index = self.index(None)
|
||||
if index:
|
||||
new_contents.append(do_xobjs(self[:index]))
|
||||
new_contents.extend(old_contents)
|
||||
|
||||
index += 1
|
||||
if index < len(self):
|
||||
new_contents.append(do_xobjs(self[index:]))
|
||||
# There are elements to add after the original page contents,
|
||||
# so push the graphics state to the stack. Restored below.
|
||||
new_contents.append(PdfDict(indirect=True, stream='q'))
|
||||
|
||||
new_contents.extend(old_contents)
|
||||
|
||||
if index < len(self):
|
||||
# Restore graphics state and add other elements.
|
||||
new_contents.append(do_xobjs(self[index:], restore_first=True))
|
||||
|
||||
if mbox is None:
|
||||
cbox = None
|
||||
|
|
|
@ -19,7 +19,8 @@ from .errors import PdfParseError, log
|
|||
from .tokens import PdfTokens
|
||||
from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
|
||||
from .uncompress import uncompress
|
||||
from .py23_diffs import convert_load, iteritems
|
||||
from . import crypt
|
||||
from .py23_diffs import convert_load, convert_store, iteritems
|
||||
|
||||
|
||||
class PdfReader(PdfDict):
|
||||
|
@ -265,8 +266,17 @@ class PdfReader(PdfDict):
|
|||
for key in new:
|
||||
self.loadindirect(key)
|
||||
|
||||
def decrypt_all(self):
|
||||
self.read_all()
|
||||
|
||||
if self.crypt_filters is not None:
|
||||
crypt.decrypt_objects(
|
||||
self.indirect_objects.values(), self.stream_crypt_filter,
|
||||
self.crypt_filters)
|
||||
|
||||
def uncompress(self):
|
||||
self.read_all()
|
||||
|
||||
uncompress(self.indirect_objects.values())
|
||||
|
||||
def load_stream_objects(self, object_streams):
|
||||
|
@ -279,22 +289,26 @@ class PdfReader(PdfDict):
|
|||
|
||||
# read objects from stream
|
||||
if objs:
|
||||
# Decrypt
|
||||
if self.crypt_filters is not None:
|
||||
crypt.decrypt_objects(
|
||||
objs, self.stream_crypt_filter, self.crypt_filters)
|
||||
|
||||
# Decompress
|
||||
uncompress(objs)
|
||||
|
||||
for obj in objs:
|
||||
objsource = PdfTokens(obj.stream, 0, False)
|
||||
snext = objsource.next
|
||||
offsets = {}
|
||||
next = objsource.next
|
||||
offsets = []
|
||||
firstoffset = int(obj.First)
|
||||
num = snext()
|
||||
while num.isdigit():
|
||||
offset = int(snext())
|
||||
offsets[int(num)] = firstoffset + offset
|
||||
num = snext()
|
||||
for num, offset in iteritems(offsets):
|
||||
while objsource.floc < firstoffset:
|
||||
offsets.append((int(next()), firstoffset + int(next())))
|
||||
for num, offset in offsets:
|
||||
# Read the object, and call special code if it starts
|
||||
# an array or dictionary
|
||||
objsource.floc = offset
|
||||
sobj = snext()
|
||||
sobj = next()
|
||||
func = self.special.get(sobj)
|
||||
if func is not None:
|
||||
sobj = func(objsource)
|
||||
|
@ -332,7 +346,6 @@ class PdfReader(PdfDict):
|
|||
'''
|
||||
|
||||
def readint(s, lengths):
|
||||
lengths = itertools.cycle(lengths)
|
||||
offset = 0
|
||||
for length in itertools.cycle(lengths):
|
||||
next = offset + length
|
||||
|
@ -354,8 +367,13 @@ class PdfReader(PdfDict):
|
|||
source.exception('Expected dict type of /XRef')
|
||||
tok = next()
|
||||
self.readstream(obj, self.findstream(obj, tok, source), source, True)
|
||||
old_strm = obj.stream
|
||||
if not uncompress([obj], True):
|
||||
source.exception('Could not decompress Xref stream')
|
||||
stream = obj.stream
|
||||
# Fix for issue #76 -- goofy compressed xref stream
|
||||
# that is NOT ACTUALLY COMPRESSED
|
||||
stream = stream if stream is not old_strm else convert_store(old_strm)
|
||||
num_pairs = obj.Index or PdfArray(['0', obj.Size])
|
||||
num_pairs = [int(x) for x in num_pairs]
|
||||
num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
|
||||
|
@ -363,7 +381,7 @@ class PdfReader(PdfDict):
|
|||
if len(entry_sizes) != 3:
|
||||
source.exception('Invalid entry size')
|
||||
object_streams = defaultdict(list)
|
||||
get = readint(obj.stream, entry_sizes)
|
||||
get = readint(stream, entry_sizes)
|
||||
for objnum, size in num_pairs:
|
||||
for cnt in range(size):
|
||||
xtype, p1, p2 = islice(get, 3)
|
||||
|
@ -431,7 +449,10 @@ class PdfReader(PdfDict):
|
|||
''' Parse (one of) the cross-reference file section(s)
|
||||
'''
|
||||
next = source.next
|
||||
tok = next()
|
||||
try:
|
||||
tok = next()
|
||||
except StopIteration:
|
||||
tok = ''
|
||||
if tok.isdigit():
|
||||
return self.parse_xref_stream(source), True
|
||||
elif tok == 'xref':
|
||||
|
@ -450,36 +471,92 @@ class PdfReader(PdfDict):
|
|||
typename = PdfName.Type
|
||||
kidname = PdfName.Kids
|
||||
|
||||
# PDFs can have arbitrarily nested Pages/Page
|
||||
# dictionary structures.
|
||||
def readnode(node):
|
||||
nodetype = node[typename]
|
||||
if nodetype == pagename:
|
||||
yield node
|
||||
elif nodetype == pagesname:
|
||||
for node in node[kidname]:
|
||||
for node in readnode(node):
|
||||
yield node
|
||||
elif nodetype == catalogname:
|
||||
for node in readnode(node[pagesname]):
|
||||
yield node
|
||||
else:
|
||||
log.error('Expected /Page or /Pages dictionary, got %s' %
|
||||
repr(node))
|
||||
try:
|
||||
return list(readnode(node))
|
||||
result = []
|
||||
stack = [node]
|
||||
append = result.append
|
||||
pop = stack.pop
|
||||
while stack:
|
||||
node = pop()
|
||||
nodetype = node[typename]
|
||||
if nodetype == pagename:
|
||||
append(node)
|
||||
elif nodetype == pagesname:
|
||||
stack.extend(reversed(node[kidname]))
|
||||
elif nodetype == catalogname:
|
||||
stack.append(node[pagesname])
|
||||
else:
|
||||
log.error('Expected /Page or /Pages dictionary, got %s' %
|
||||
repr(node))
|
||||
return result
|
||||
except (AttributeError, TypeError) as s:
|
||||
log.error('Invalid page tree: %s' % s)
|
||||
return []
|
||||
|
||||
def __init__(self, fname=None, fdata=None, decompress=False,
|
||||
disable_gc=True, verbose=True):
|
||||
def _parse_encrypt_info(self, source, password, trailer):
|
||||
"""Check password and initialize crypt filters."""
|
||||
# Create and check password key
|
||||
key = crypt.create_key(password, trailer)
|
||||
|
||||
if not crypt.check_user_password(key, trailer):
|
||||
source.warning('User password does not validate')
|
||||
|
||||
# Create default crypt filters
|
||||
private = self.private
|
||||
crypt_filters = self.crypt_filters
|
||||
version = int(trailer.Encrypt.V or 0)
|
||||
if version in (1, 2):
|
||||
crypt_filter = crypt.RC4CryptFilter(key)
|
||||
private.stream_crypt_filter = crypt_filter
|
||||
private.string_crypt_filter = crypt_filter
|
||||
elif version == 4:
|
||||
if PdfName.CF in trailer.Encrypt:
|
||||
for name, params in iteritems(trailer.Encrypt.CF):
|
||||
if name == PdfName.Identity:
|
||||
continue
|
||||
|
||||
cfm = params.CFM
|
||||
if cfm == PdfName.AESV2:
|
||||
crypt_filters[name] = crypt.AESCryptFilter(key)
|
||||
elif cfm == PdfName.V2:
|
||||
crypt_filters[name] = crypt.RC4CryptFilter(key)
|
||||
else:
|
||||
source.warning(
|
||||
'Unsupported crypt filter: {}, {}'.format(
|
||||
name, cfm))
|
||||
|
||||
# Read default stream filter
|
||||
if PdfName.StmF in trailer.Encrypt:
|
||||
name = trailer.Encrypt.StmF
|
||||
if name in crypt_filters:
|
||||
private.stream_crypt_filter = crypt_filters[name]
|
||||
else:
|
||||
source.warning(
|
||||
'Invalid crypt filter name in /StmF:'
|
||||
' {}'.format(name))
|
||||
|
||||
# Read default string filter
|
||||
if PdfName.StrF in trailer.Encrypt:
|
||||
name = trailer.Encrypt.StrF
|
||||
if name in crypt_filters:
|
||||
private.string_crypt_filter = crypt_filters[name]
|
||||
else:
|
||||
source.warning(
|
||||
'Invalid crypt filter name in /StrF:'
|
||||
' {}'.format(name))
|
||||
else:
|
||||
source.warning(
|
||||
'Unsupported Encrypt version: {}'.format(version))
|
||||
|
||||
def __init__(self, fname=None, fdata=None, decompress=False,
|
||||
decrypt=False, password='', disable_gc=True, verbose=True):
|
||||
self.private.verbose = verbose
|
||||
|
||||
# Runs a lot faster with GC off.
|
||||
disable_gc = disable_gc and gc.isenabled()
|
||||
if disable_gc:
|
||||
gc.disable()
|
||||
|
||||
try:
|
||||
if fname is not None:
|
||||
assert fdata is None
|
||||
|
@ -494,8 +571,10 @@ class PdfReader(PdfDict):
|
|||
except IOError:
|
||||
raise PdfParseError('Could not read PDF file %s' %
|
||||
fname)
|
||||
fdata = convert_load(fdata)
|
||||
|
||||
assert fdata is not None
|
||||
fdata = convert_load(fdata)
|
||||
|
||||
if not fdata.startswith('%PDF-'):
|
||||
startloc = fdata.find('%PDF-')
|
||||
if startloc >= 0:
|
||||
|
@ -548,6 +627,23 @@ class PdfReader(PdfDict):
|
|||
xref_list.append((source.obj_offsets, trailer, is_stream))
|
||||
source.floc = int(prev)
|
||||
|
||||
# Handle document encryption
|
||||
private.crypt_filters = None
|
||||
if decrypt and PdfName.Encrypt in trailer:
|
||||
identity_filter = crypt.IdentityCryptFilter()
|
||||
crypt_filters = {
|
||||
PdfName.Identity: identity_filter
|
||||
}
|
||||
private.crypt_filters = crypt_filters
|
||||
private.stream_crypt_filter = identity_filter
|
||||
private.string_crypt_filter = identity_filter
|
||||
|
||||
if not crypt.HAS_CRYPTO:
|
||||
raise PdfParseError(
|
||||
'Install PyCrypto to enable encryption support')
|
||||
|
||||
self._parse_encrypt_info(source, password, trailer)
|
||||
|
||||
if is_stream:
|
||||
self.load_stream_objects(trailer.object_streams)
|
||||
|
||||
|
@ -566,6 +662,10 @@ class PdfReader(PdfDict):
|
|||
float(trailer.Version) > float(self.version)):
|
||||
self.private.version = trailer.Version
|
||||
|
||||
if decrypt:
|
||||
self.decrypt_all()
|
||||
trailer.Encrypt = None
|
||||
|
||||
if is_stream:
|
||||
self.Root = trailer.Root
|
||||
self.Info = trailer.Info
|
||||
|
|
|
@ -29,7 +29,7 @@ NullObject.Type = 'Null object'
|
|||
|
||||
|
||||
def user_fmt(obj, isinstance=isinstance, float=float, str=str,
|
||||
basestring=str, encode=PdfString.encode):
|
||||
basestring=(type(u''), type(b'')), encode=PdfString.encode):
|
||||
''' This function may be replaced by the user for
|
||||
specialized formatting requirements.
|
||||
'''
|
||||
|
@ -137,11 +137,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
|
|||
elif isinstance(obj, PdfDict):
|
||||
if compress and obj.stream:
|
||||
do_compress([obj])
|
||||
pairs = sorted((x, y, getattr(x, 'encoded', x))
|
||||
pairs = sorted((getattr(x, 'encoded', None) or x, y)
|
||||
for (x, y) in obj.iteritems())
|
||||
myarray = []
|
||||
for key, value, encoding in pairs:
|
||||
myarray.append(encoding)
|
||||
for key, value in pairs:
|
||||
myarray.append(key)
|
||||
myarray.append(add(value))
|
||||
result = format_array(myarray, '<<%s>>')
|
||||
stream = obj.stream
|
||||
|
@ -155,7 +155,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
|
|||
# We assume that an object with an indirect
|
||||
# attribute knows how to represent itself to us.
|
||||
if hasattr(obj, 'indirect'):
|
||||
return str(getattr(obj, 'encoded', obj))
|
||||
return str(getattr(obj, 'encoded', None) or obj)
|
||||
return user_fmt(obj)
|
||||
|
||||
def format_deferred():
|
||||
|
@ -177,10 +177,10 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
|
|||
|
||||
# Don't reference old catalog or pages objects --
|
||||
# swap references to new ones.
|
||||
swapobj = {PdfName.Catalog: trailer.Root,
|
||||
type_remap = {PdfName.Catalog: trailer.Root,
|
||||
PdfName.Pages: trailer.Root.Pages, None: trailer}.get
|
||||
swapobj = [(objid, swapobj(obj.Type))
|
||||
for objid, obj in iteritems(killobj)]
|
||||
swapobj = [(objid, type_remap(obj.Type) if new_obj is None else new_obj)
|
||||
for objid, (obj, new_obj) in iteritems(killobj)]
|
||||
swapobj = dict((objid, obj is None and NullObject or obj)
|
||||
for objid, obj in swapobj).get
|
||||
|
||||
|
@ -225,11 +225,44 @@ class PdfWriter(object):
|
|||
|
||||
_trailer = None
|
||||
canonicalize = False
|
||||
fname = None
|
||||
|
||||
def __init__(self, version='1.3', compress=False):
|
||||
self.pagearray = PdfArray()
|
||||
self.compress = compress
|
||||
def __init__(self, fname=None, version='1.3', compress=False, **kwargs):
|
||||
"""
|
||||
Parameters:
|
||||
fname -- Output file name, or file-like binary object
|
||||
with a write method
|
||||
version -- PDF version to target. Currently only 1.3
|
||||
supported.
|
||||
compress -- True to do compression on output. Currently
|
||||
compresses stream objects.
|
||||
"""
|
||||
|
||||
# Legacy support: fname is new, was added in front
|
||||
if fname is not None:
|
||||
try:
|
||||
float(fname)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
else:
|
||||
if version != '1.3':
|
||||
assert compress == False
|
||||
compress = version
|
||||
version = fname
|
||||
fname = None
|
||||
|
||||
self.fname = fname
|
||||
self.version = version
|
||||
self.compress = compress
|
||||
|
||||
if kwargs:
|
||||
for name, value in iteritems(kwargs):
|
||||
if name not in self.replaceable:
|
||||
raise ValueError("Cannot set attribute %s "
|
||||
"on PdfWriter instance" % name)
|
||||
setattr(self, name, value)
|
||||
|
||||
self.pagearray = PdfArray()
|
||||
self.killobj = {}
|
||||
|
||||
def addpage(self, page):
|
||||
|
@ -251,13 +284,14 @@ class PdfWriter(object):
|
|||
# Add parents in the hierarchy to objects we
|
||||
# don't want to output
|
||||
killobj = self.killobj
|
||||
obj = page.Parent
|
||||
obj, new_obj = page, self.pagearray[-1]
|
||||
while obj is not None:
|
||||
objid = id(obj)
|
||||
if objid in killobj:
|
||||
break
|
||||
killobj[objid] = obj
|
||||
killobj[objid] = obj, new_obj
|
||||
obj = obj.Parent
|
||||
new_obj = None
|
||||
return self
|
||||
|
||||
addPage = addpage # for compatibility with pyPdf
|
||||
|
@ -300,10 +334,18 @@ class PdfWriter(object):
|
|||
|
||||
trailer = property(_get_trailer, _set_trailer)
|
||||
|
||||
def write(self, fname, trailer=None, user_fmt=user_fmt,
|
||||
def write(self, fname=None, trailer=None, user_fmt=user_fmt,
|
||||
disable_gc=True):
|
||||
|
||||
trailer = trailer or self.trailer
|
||||
|
||||
# Support fname for legacy applications
|
||||
if (fname is not None) == (self.fname is not None):
|
||||
raise PdfOutputError(
|
||||
"PdfWriter fname must be specified exactly once")
|
||||
|
||||
fname = fname or self.fname
|
||||
|
||||
# Dump the data. We either have a filename or a preexisting
|
||||
# file object.
|
||||
preexisting = hasattr(fname, 'write')
|
||||
|
@ -339,3 +381,5 @@ class PdfWriter(object):
|
|||
workitems += obj
|
||||
else:
|
||||
workitems += obj.values()
|
||||
|
||||
replaceable = set(vars())
|
|
@ -14,7 +14,9 @@ try:
|
|||
except NameError:
|
||||
|
||||
def convert_load(s):
|
||||
return s.decode('Latin-1')
|
||||
if isinstance(s, bytes):
|
||||
return s.decode('Latin-1')
|
||||
return s
|
||||
|
||||
def convert_store(s):
|
||||
return s.encode('Latin-1')
|
||||
|
@ -44,3 +46,8 @@ try:
|
|||
xrange = xrange
|
||||
except NameError:
|
||||
xrange = range
|
||||
|
||||
try:
|
||||
intern = intern
|
||||
except NameError:
|
||||
from sys import intern
|
||||
|
|
|
@ -15,7 +15,7 @@ import itertools
|
|||
from .objects import PdfString, PdfObject
|
||||
from .objects.pdfname import BasePdfName
|
||||
from .errors import log, PdfParseError
|
||||
from .py23_diffs import nextattr
|
||||
from .py23_diffs import nextattr, intern
|
||||
|
||||
|
||||
def linepos(fdata, loc):
|
||||
|
@ -64,19 +64,7 @@ class PdfTokens(object):
|
|||
findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
|
||||
whitespace), re.DOTALL).finditer
|
||||
|
||||
def _cacheobj(cache, obj, constructor):
|
||||
''' This caching relies on the constructors
|
||||
returning something that will compare as
|
||||
equal to the original obj. This works
|
||||
fine with our PDF objects.
|
||||
'''
|
||||
result = cache.get(obj)
|
||||
if result is None:
|
||||
result = constructor(obj)
|
||||
cache[result] = result
|
||||
return result
|
||||
|
||||
def _gettoks(self, startloc, cacheobj=_cacheobj,
|
||||
def _gettoks(self, startloc, intern=intern,
|
||||
delimiters=delimiters, findtok=findtok,
|
||||
findparen=findparen, PdfString=PdfString,
|
||||
PdfObject=PdfObject, BasePdfName=BasePdfName):
|
||||
|
@ -95,24 +83,23 @@ class PdfTokens(object):
|
|||
fdata = self.fdata
|
||||
current = self.current = [(startloc, startloc)]
|
||||
cache = {}
|
||||
get_cache = cache.get
|
||||
while 1:
|
||||
for match in findtok(fdata, current[0][1]):
|
||||
current[0] = tokspan = match.span()
|
||||
token = match.group(1)
|
||||
firstch = token[0]
|
||||
toktype = intern
|
||||
if firstch not in delimiters:
|
||||
token = cacheobj(cache, token, PdfObject)
|
||||
toktype = PdfObject
|
||||
elif firstch in '/<(%':
|
||||
if firstch == '/':
|
||||
# PDF Name
|
||||
encoded = token
|
||||
token = cache.get(encoded)
|
||||
if token is None:
|
||||
token = cache[token] = BasePdfName(encoded)
|
||||
toktype = BasePdfName
|
||||
elif firstch == '<':
|
||||
# << dict delim, or < hex string >
|
||||
if token[1:2] != '<':
|
||||
token = cacheobj(cache, token, PdfString)
|
||||
toktype = PdfString
|
||||
elif firstch == '(':
|
||||
# Literal string
|
||||
# It's probably simple, but maybe not
|
||||
|
@ -145,7 +132,7 @@ class PdfTokens(object):
|
|||
loc, ends, nest = ends
|
||||
token = fdata[m_start:loc] + ')' * nest
|
||||
current[0] = m_start, ends
|
||||
token = cacheobj(cache, token, PdfString)
|
||||
toktype = PdfString
|
||||
elif firstch == '%':
|
||||
# Comment
|
||||
if self.strip_comments:
|
||||
|
@ -154,7 +141,10 @@ class PdfTokens(object):
|
|||
self.exception(('Tokenizer logic incorrect -- '
|
||||
'should never get here'))
|
||||
|
||||
yield token
|
||||
newtok = get_cache(token)
|
||||
if newtok is None:
|
||||
newtok = cache[token] = toktype(token)
|
||||
yield newtok
|
||||
if current[0] is not tokspan:
|
||||
break
|
||||
else:
|
||||
|
@ -168,6 +158,7 @@ class PdfTokens(object):
|
|||
self.iterator = iterator = self._gettoks(startloc)
|
||||
self.msgs_dumped = None if verbose else set()
|
||||
self.next = getattr(iterator, nextattr)
|
||||
self.current = [(startloc, startloc)]
|
||||
|
||||
def setstart(self, startloc):
|
||||
''' Change the starting location.
|
||||
|
@ -213,6 +204,8 @@ class PdfTokens(object):
|
|||
msg %= arg
|
||||
fdata = self.fdata
|
||||
begin, end = self.current[0]
|
||||
if begin >= len(fdata):
|
||||
return '%s (filepos %s past EOF %s)' % (msg, begin, len(fdata))
|
||||
line, col = linepos(fdata, begin)
|
||||
if end > begin:
|
||||
tok = fdata[begin:end].rstrip()
|
||||
|
|
|
@ -108,7 +108,7 @@ def _makearray(rldoc, pdfobj):
|
|||
def _makestr(rldoc, pdfobj):
|
||||
assert isinstance(pdfobj, (float, int, str)), repr(pdfobj)
|
||||
# TODO: Add fix for float like in pdfwriter
|
||||
return str(getattr(pdfobj, 'encoded', pdfobj))
|
||||
return str(getattr(pdfobj, 'encoded', None) or pdfobj)
|
||||
|
||||
|
||||
def makerl_recurse(rldoc, pdfobj):
|
||||
|
|
|
@ -12,7 +12,7 @@ PNG predictor were originally transcribed from PyPDF2, which is
|
|||
probably an excellent source of additional filters.
|
||||
'''
|
||||
import array
|
||||
from .objects import PdfDict, PdfName
|
||||
from .objects import PdfDict, PdfName, PdfArray
|
||||
from .errors import log
|
||||
from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
|
||||
|
||||
|
@ -37,7 +37,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
|
|||
if isinstance(ftype, list) and len(ftype) == 1:
|
||||
# todo: multiple filters
|
||||
ftype = ftype[0]
|
||||
parms = obj.DecodeParms
|
||||
parms = obj.DecodeParms or obj.DP
|
||||
if ftype != flate:
|
||||
msg = ('Not decompressing: cannot use filter %s'
|
||||
' with parameters %s') % (repr(ftype), repr(parms))
|
||||
|
@ -53,10 +53,18 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
|
|||
error = str(s)
|
||||
else:
|
||||
error = None
|
||||
if isinstance(parms, PdfArray):
|
||||
oldparms = parms
|
||||
parms = PdfDict()
|
||||
for x in oldparms:
|
||||
parms.update(x)
|
||||
if parms:
|
||||
predictor = int(parms.Predictor or 1)
|
||||
columns = int(parms.Columns or 1)
|
||||
colors = int(parms.Colors or 1)
|
||||
bpc = int(parms.BitsPerComponent or 8)
|
||||
if 10 <= predictor <= 15:
|
||||
data, error = flate_png(data, parms)
|
||||
data, error = flate_png(data, predictor, columns, colors, bpc)
|
||||
elif predictor != 1:
|
||||
error = ('Unsupported flatedecode predictor %s' %
|
||||
repr(predictor))
|
||||
|
@ -74,7 +82,7 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
|
|||
return ok
|
||||
|
||||
|
||||
def flate_png(data, parms):
|
||||
def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
|
||||
''' PNG prediction is used to make certain kinds of data
|
||||
more compressible. Before the compression, each data
|
||||
byte is either left the same, or is set to be a delta
|
||||
|
@ -87,9 +95,12 @@ def flate_png(data, parms):
|
|||
this technique for Xref stream objects, which are
|
||||
quite regular.
|
||||
'''
|
||||
columns = int(parms.Columns)
|
||||
columnbytes = ((columns * colors * bpc) + 7) // 8
|
||||
data = array.array('B', data)
|
||||
rowlen = columns + 1
|
||||
rowlen = columnbytes + 1
|
||||
if predictor == 15:
|
||||
padding = (rowlen - len(data)) % rowlen
|
||||
data.extend([0] * padding)
|
||||
assert len(data) % rowlen == 0
|
||||
rows = xrange(0, len(data), rowlen)
|
||||
for row_index in rows:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
Notes on releasing, which is not yet fully automated:
|
||||
|
||||
1) Update version number both in __init__ and in setup
|
||||
1) Update version number in pdfrw/__init__.py
|
||||
|
||||
2) Use pyroma
|
||||
|
||||
|
|
4
setup.py
4
setup.py
|
@ -27,7 +27,8 @@ setup(
|
|||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.3',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Topic :: Multimedia :: Graphics :: Graphics Conversion',
|
||||
'Topic :: Software Development :: Libraries',
|
||||
'Topic :: Text Processing',
|
||||
|
@ -35,4 +36,5 @@ setup(
|
|||
'Topic :: Utilities',
|
||||
],
|
||||
keywords='pdf vector graphics PDF nup watermark split join merge',
|
||||
zip_safe=True,
|
||||
)
|
||||
|
|
|
@ -11,8 +11,8 @@ examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb
|
|||
examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b
|
||||
|
||||
examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d
|
||||
examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 841c980dfadf2cc47ad86e4649ca69b6
|
||||
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 41989bb2cb6225c6e14262ff5d4f151f
|
||||
examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331
|
||||
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e
|
||||
examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612
|
||||
|
||||
# All these are in the poster test
|
||||
|
@ -20,10 +20,10 @@ examples/subset_1975ef8db7355b1d691bc79d0749574b_21 5057f345f1a1109a0e54276a
|
|||
examples/rotate_5057f345f1a1109a0e54276a68e8f8df_90_1 881f4dc8dcf069e707bf61af95492d86
|
||||
examples/poster_881f4dc8dcf069e707bf61af95492d86 a34be06d22105b6c02394a9f278fec0d
|
||||
|
||||
examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab 959d6246ad8bda72bd023e8681216d17
|
||||
examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 45b4ae29a038271896b7264bbed63bdf
|
||||
examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 822bce1cb9e053f1f3f6b922bf27fab8
|
||||
examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 97ad6a8ca3fe7cc4e1f0ffb8475355e9
|
||||
examples/rl1/4up_b1c400de699af29ea3f1983bb26870ab e21dfdd9ae56ddb261dc3d02bf6da198
|
||||
examples/rl1/booklet_b1c400de699af29ea3f1983bb26870ab 410063b7fbae1c6d5af33758e2b43450
|
||||
examples/rl1/subset_b1c400de699af29ea3f1983bb26870ab_3_5 745f1ac31a18d86afb294a449b72cb98
|
||||
examples/rl1/platypus_pdf_template_b1c400de699af29ea3f1983bb26870ab 88bd087c4dc039ced05faea3920cbec5
|
||||
|
||||
# List things that need work here (typically cause exceptions)
|
||||
|
||||
|
@ -68,32 +68,33 @@ repaginate/06c86654f9a77e82f9adaa0086fc391c.pdf 848966fe40a1e3de842f82700dc6d67b
|
|||
repaginate/08f69084d72dabc5dfdcf5c1ff2a719f.pdf b8c60878b0e0ce81cb6e8777038166b1
|
||||
repaginate/09715ec1a7b0f3a7ae02b3046f627b9f.pdf daf7cff9c0a15bbb347489f9fbda25f8
|
||||
repaginate/0a61de50b5ee0ea4d5d69c95dab817a3.pdf c6cd38b1131c4b856f60ebfcf51da6f5
|
||||
repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 53e5510be27db134edf3cf23873914af
|
||||
repaginate/1975ef8db7355b1d691bc79d0749574b.pdf 43433398ccb1edaaee734f4949a5cc3c
|
||||
repaginate/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 20dc3be2affe9082564c01b1146d7598
|
||||
repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 019aead1450842406a04c508243e5161
|
||||
repaginate/22628a7ed578b622520325673ab2a4f2.pdf 255776a6956918c7b324dede711680ae
|
||||
repaginate/1f5dd128c3757420a881a155f2f8ace3.pdf 7130f1568526247895856806b3879db4
|
||||
repaginate/22628a7ed578b622520325673ab2a4f2.pdf e312c9c588a5ccdb1a11ac37149b178b
|
||||
repaginate/2ac7c68e26a8ef797aead15e4875cc6d.pdf e7344551183415d6257e2cab2aef4a61
|
||||
repaginate/295d26e61a85635433f8e4b768953f60.pdf 13ece51f4d2ad25707982765abbcd789
|
||||
repaginate/295d26e61a85635433f8e4b768953f60.pdf a89a9fa39812ecd9fa5d6b9e785f389d
|
||||
repaginate/2d31f356c37dadd04b83ecc4e9a739a0.pdf bc04b61b41cb51f6a1c1da79fb387795
|
||||
repaginate/2fac0d9a189ca5fcef8626153d050be8.pdf 95fe3d9258ace5bdccb95a55c2c8cb22
|
||||
repaginate/319c998910453bc44d40c7748cd2cb79.pdf c1a19d1acc3f172711bdbea000cf392e
|
||||
repaginate/319c998910453bc44d40c7748cd2cb79.pdf c0da6bf6db273bdb1385f408dcf063d0
|
||||
repaginate/35df0b8cff4afec0c08f08c6a5bc9857.pdf 3568e1c885a461b350c790ec5b729af3
|
||||
repaginate/365b9c95574ee8944370fe286905d0e8.pdf 84e5fc0d4f30ff8db05780fd244d9cf0
|
||||
repaginate/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
|
||||
repaginate/49e31fd074eca6af981d78d42d0078ec.pdf 77fd3fa86c7c0166a373b66cfef357d2
|
||||
repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf d0b7467d7bd6c7f73b7764b06c0be1aa
|
||||
repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 6b0ab50c247ca43b70b2b2f27ee2c1a2
|
||||
repaginate/5f0cff36d0ad74536a6513a98a755016.pdf b65c2557988db8625c0761bab1d131f1
|
||||
repaginate/5f265db2736850782aeaba2571a3c749.pdf 9bb5644ede0ee7cf99642729eda76686
|
||||
repaginate/6a42c8c79b807bf164d31071749e07b0.pdf 33a231263e1a4203338b7b1052fc0091
|
||||
repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 93419e831e436d9093a153f35d3441c3
|
||||
repaginate/536dfc6fbadd87c03eb59375d091eb53.pdf afc90878b1306483dbde37c3a50b6a45
|
||||
repaginate/569f8094597bbe5b58efc3a7c6e14e87.pdf 894bf526c0a73ab70ebfd9bf3d614315
|
||||
repaginate/5f0cff36d0ad74536a6513a98a755016.pdf 3298a3a13439764102395a34d571ff69
|
||||
repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c
|
||||
repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f
|
||||
repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e
|
||||
repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261
|
||||
repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf 6c65526ab372d72cb185933e3d2584ef
|
||||
repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c
|
||||
repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||
repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233
|
||||
repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea
|
||||
repaginate/9d8626d18b1d8807d271e6ffc409446a.pdf cdfcf8add1af9e612ba1a2ee06a6a273
|
||||
repaginate/9f98322c243fe67726d56ccfa8e0885b.pdf 69503ac140a1e4f1322f9350646e3dae
|
||||
repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf b0d1f3925423f9c3ecf4a47baa949f75
|
||||
repaginate/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8cddb0f9741f7515107b1bce5dc90c83
|
||||
repaginate/c5c895deecf7a7565393587e0d61be2b.pdf 59e350c6f7d7b89fab36a4019bb526fd
|
||||
repaginate/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf 3623b7f200818c63cb6838f9678a4840
|
||||
repaginate/d6fd9567078b48c86710e9c49173781f.pdf 874b532f61139261f71afb5987dd2a68
|
||||
|
@ -101,6 +102,7 @@ repaginate/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 7d3c3ae13cc7d53e7fa6ef046e15dbaa
|
|||
repaginate/ec00d5825f47b9d0faa953b1709163c3.pdf 8e6a481476c2b3bdd64ce8e36f8fe273
|
||||
repaginate/ed81787b83cc317c9f049643b853bea3.pdf 4636b68f294302417b81aaaadde1c73d
|
||||
|
||||
|
||||
simple/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
|
||||
simple/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 5a41601f6033356539e623091a3f79ef
|
||||
simple/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||
|
@ -111,6 +113,7 @@ simple/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
|
|||
simple/22628a7ed578b622520325673ab2a4f2.pdf 1163cec415728899e997a29be465d02d
|
||||
simple/295d26e61a85635433f8e4b768953f60.pdf fe3b8960c7f877db05c7cd12c9c6e097
|
||||
simple/2ac7c68e26a8ef797aead15e4875cc6d.pdf 2623eae06eada9587574f8ddd7fc80fa
|
||||
simple/2d31f356c37dadd04b83ecc4e9a739a0.pdf 9af4794d366fbd5840836e6612ceedd2
|
||||
simple/2fac0d9a189ca5fcef8626153d050be8.pdf 458501ecda909b00262b9654f0b09ebf
|
||||
simple/319c998910453bc44d40c7748cd2cb79.pdf 8c84e36ec1db8c1dbfaa312646e000b4
|
||||
simple/35df0b8cff4afec0c08f08c6a5bc9857.pdf 0a2926c23ad916c449d5dadcfa9d38ef
|
||||
|
@ -124,7 +127,7 @@ simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9
|
|||
simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||
simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf
|
||||
simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05
|
||||
simple/707e3e2d17cbe9ec2273414b3b63f333.pdf 4bdf1e57a96ce42717110b4e55098c1a
|
||||
simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70
|
||||
simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||
simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e
|
||||
simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
|
||||
|
@ -138,3 +141,85 @@ simple/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 5bc96989bc4f4b6438da953443336124
|
|||
simple/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
|
||||
simple/ed81787b83cc317c9f049643b853bea3.pdf c227d627217dc6808c50e80063734d27
|
||||
|
||||
|
||||
decompress/06c86654f9a77e82f9adaa0086fc391c.pdf 6e2a2e063de895d28dfea9aacb9fe469
|
||||
decompress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
|
||||
decompress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf ccadb859eff77d525bf86f6d821ccf1b
|
||||
decompress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 2b9c8b26a92c7645cfefa1bfa8a8ab36
|
||||
decompress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||
decompress/1975ef8db7355b1d691bc79d0749574b.pdf a7d5eaf0a4259352898047f284e20b90
|
||||
decompress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf 40d1cc7e26213510319b519032aff637
|
||||
decompress/1f5dd128c3757420a881a155f2f8ace3.pdf b0d01f9d6ac156326aeb14b940aa73e7
|
||||
decompress/22628a7ed578b622520325673ab2a4f2.pdf b68c7bf46ad4b70addc3369ba669dc7b
|
||||
decompress/295d26e61a85635433f8e4b768953f60.pdf 6f2ae8fb0ff853ed63537d8767ce13ad
|
||||
decompress/2ac7c68e26a8ef797aead15e4875cc6d.pdf d8d5589991ce15c834f35b340e7147a9
|
||||
decompress/2d31f356c37dadd04b83ecc4e9a739a0.pdf 5a6b732690c42f07ae6a41c37cf28ff3
|
||||
decompress/2fac0d9a189ca5fcef8626153d050be8.pdf 998366ad30becd31bed711ba78c59a7f
|
||||
decompress/319c998910453bc44d40c7748cd2cb79.pdf 7933a591caf3d49e45a42733bc48f99e
|
||||
decompress/35df0b8cff4afec0c08f08c6a5bc9857.pdf e339ae7747898d2faba270473171692a
|
||||
decompress/365b9c95574ee8944370fe286905d0e8.pdf 9da0100b5844c86e93093d0fbc78b3f6
|
||||
decompress/4805fdcd7e142e8df3c04c6ba06025af.pdf 3b5b8254dc99c2f0f62fe2afa42fad4e
|
||||
decompress/49e31fd074eca6af981d78d42d0078ec.pdf 4e9bf31753ff7232de4c612a31bd21fc
|
||||
decompress/536dfc6fbadd87c03eb59375d091eb53.pdf f755d2ef6052270121168d2341ad04b6
|
||||
decompress/569f8094597bbe5b58efc3a7c6e14e87.pdf aa782a7d553ec767ab61517996337f58
|
||||
decompress/5f0cff36d0ad74536a6513a98a755016.pdf 9caae4e3a21eba9e4aa76620e7508d56
|
||||
decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149
|
||||
decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||
decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0
|
||||
decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158
|
||||
decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24
|
||||
decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2
|
||||
decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d
|
||||
decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb
|
||||
decompress/9d8626d18b1d8807d271e6ffc409446a.pdf 6498bd354bb221516517a4c49bcb94f6
|
||||
decompress/9f98322c243fe67726d56ccfa8e0885b.pdf 4b53b63b0779b81d8f9569e66ca3d8ee
|
||||
decompress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
|
||||
decompress/b1c400de699af29ea3f1983bb26870ab.pdf 08a5de62129a96d8d9a8f27052bfb227
|
||||
decompress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 8e0eb14c12fc89e7cbb4001861d7198f
|
||||
decompress/c5c895deecf7a7565393587e0d61be2b.pdf 2cc3c75e56d5dd562ca5b1f994bd9d5c
|
||||
decompress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf aaed7215c60dbf19bb4fefe88602196a
|
||||
decompress/d6fd9567078b48c86710e9c49173781f.pdf 1fd1b4bc184e64ea6260c30261adf9c4
|
||||
decompress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf 62b87ec47f1b93d75c32d0c78b6c2380
|
||||
decompress/ec00d5825f47b9d0faa953b1709163c3.pdf 708f66049169c28ac39b0553908dc318
|
||||
decompress/ed81787b83cc317c9f049643b853bea3.pdf 5c0a3bc5b19d58d48767bff8f31daae0
|
||||
|
||||
compress/06c86654f9a77e82f9adaa0086fc391c.pdf b6fb771b49971f2b63a197f3ef1531aa
|
||||
compress/07b0ba4cff1c6ff73fd468b04b013457.pdf 499b9c1b1e1c76b7c5c0d5e3b62889e3
|
||||
compress/08f69084d72dabc5dfdcf5c1ff2a719f.pdf 3e7e53a92f96d52bbffe3ffa03d7b11e
|
||||
compress/09715ec1a7b0f3a7ae02b3046f627b9f.pdf 563ffde527978517393d9166b02c17d3
|
||||
compress/0a61de50b5ee0ea4d5d69c95dab817a3.pdf 182712dd5be8aebd29decb57cf530334
|
||||
compress/1975ef8db7355b1d691bc79d0749574b.pdf d505caa75f8becea1a1c810f4a143976
|
||||
compress/1c2af1d2b0db6cac3c8e558a26efd38b.pdf b78f4e45aef4149a068a0225ea1be88c
|
||||
compress/1f5dd128c3757420a881a155f2f8ace3.pdf 22148c2a65129f936b8e8c67397e5bf6
|
||||
compress/22628a7ed578b622520325673ab2a4f2.pdf 54ec1fa64e64bfd146f13001444346f4
|
||||
compress/295d26e61a85635433f8e4b768953f60.pdf 2ed8eb04a8c66138883a43917cd9c0c5
|
||||
compress/2ac7c68e26a8ef797aead15e4875cc6d.pdf efe942d1e5b9f2f139c7e1f2e46ced24
|
||||
compress/2d31f356c37dadd04b83ecc4e9a739a0.pdf eedc938e6782e1d15755b5c54fffc17c
|
||||
compress/2fac0d9a189ca5fcef8626153d050be8.pdf 2d1b8e82cdc82c82bec3969acf026d30
|
||||
compress/319c998910453bc44d40c7748cd2cb79.pdf 5b9ca8444a17db8cb6fa427da7a89e44
|
||||
compress/35df0b8cff4afec0c08f08c6a5bc9857.pdf 07c064df0fc0fd0c80c4a196b4c38403
|
||||
compress/365b9c95574ee8944370fe286905d0e8.pdf 1b98e92f74c2f5324cce5fc8fbe46c15
|
||||
compress/4805fdcd7e142e8df3c04c6ba06025af.pdf 4aa2e922739ba865da30a9917ddffe8e
|
||||
compress/49e31fd074eca6af981d78d42d0078ec.pdf 7422b3d205650552ff81bc06c89c13ba
|
||||
compress/536dfc6fbadd87c03eb59375d091eb53.pdf c18b0f0f8e633fe15b17772c701a76a9
|
||||
compress/569f8094597bbe5b58efc3a7c6e14e87.pdf 3ee711f7fc678787346dca5d06ee5192
|
||||
compress/5f0cff36d0ad74536a6513a98a755016.pdf bd2a1edf6299d5dc2e1ad6b5fc8bcc20
|
||||
compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c
|
||||
compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013
|
||||
compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7
|
||||
compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7
|
||||
compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e
|
||||
compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594
|
||||
compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1
|
||||
compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81
|
||||
compress/9d8626d18b1d8807d271e6ffc409446a.pdf 7c124d2d0b0c7b21cce91740dfb2a8fd
|
||||
compress/9f98322c243fe67726d56ccfa8e0885b.pdf 3167fa11a3f1f4a06f90294b21e101b7
|
||||
compress/b107669d1dd69eabb89765fabb2cb321.pdf 56025c06ab8633575ddc6c6990d2fbf1
|
||||
compress/b1c400de699af29ea3f1983bb26870ab.pdf 6eaeef32b0e28959e7681c8b02d8814f
|
||||
compress/c55eb9a13859a7fbddd8af9c16eba3a7.pdf 6ef82921011eb79a9d860214e213c868
|
||||
compress/c5c895deecf7a7565393587e0d61be2b.pdf 30d87ac6aa59d65169c389ee3badbca8
|
||||
compress/d2f0b2086160d4f3d325c79a5dc1fb4d.pdf e4c768be930e9980c970d51d5f447e24
|
||||
compress/d6fd9567078b48c86710e9c49173781f.pdf cbc8922b8bea08928463b287767ec229
|
||||
compress/e9ab02aa769f4c040a6fa52f00d6e3f0.pdf e893e407b3c2366d4ca822ce80b45c2c
|
||||
compress/ec00d5825f47b9d0faa953b1709163c3.pdf 9ba3db0dedec74c3d2a6f033f1b22a81
|
||||
compress/ed81787b83cc317c9f049643b853bea3.pdf 2ceda401f68a44a3fb1da4e0f9dfc578
|
||||
|
|
|
@ -96,7 +96,7 @@ class TestOnePdf(unittest.TestCase):
|
|||
os.remove(scrub)
|
||||
subprocess.call(params)
|
||||
if scrub:
|
||||
PdfWriter().addpages(PdfReader(scrub).pages).write(dstf)
|
||||
PdfWriter(dstf).addpages(PdfReader(scrub).pages).write()
|
||||
with open(dstf, 'rb') as f:
|
||||
data = f.read()
|
||||
size = len(data)
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
#! /usr/bin/env python
|
||||
# encoding: utf-8
|
||||
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||
# 2016 James Laird-Wah, Sydney, Australia
|
||||
# MIT license -- See LICENSE.txt for details
|
||||
|
||||
'''
|
||||
Run from the directory above like so:
|
||||
python -m tests.test_pdfstring
|
||||
'''
|
||||
|
||||
|
||||
from pdfrw import PdfDict, PdfName
|
||||
from pdfrw.objects import PdfIndirect
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
class TestPdfDicts(unittest.TestCase):
    """Exercise PdfDict's handling of indirect-object members."""

    def test_indirect_set_get(self):
        indirect = PdfIndirect((1, 2, 3))
        indirect.value = 42
        d = PdfDict()
        d.Name = indirect
        # Reading the storage through plain dict.values() bypasses any
        # PdfDict accessor overrides: the indirect wrapper is still there.
        stored, = (item for item in dict.values(d))
        self.assertEqual(stored, indirect)
        # Normal item access resolves the indirection to its value.
        resolved = d['/Name']
        self.assertEqual(resolved, indirect.value)
        # Iterating the dict yields a key of the same type as PdfName.Name.
        key, = d
        self.assertEqual(type(key), type(PdfName.Name))
|
||||
|
||||
def main():
    """Entry point: discover and run this module's tests."""
    unittest.main()


if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,28 @@
|
|||
#! /usr/bin/env python
|
||||
import static_pdfs
|
||||
|
||||
from pdfrw import PdfReader
|
||||
|
||||
try:
|
||||
import unittest2 as unittest
|
||||
except ImportError:
|
||||
import unittest
|
||||
|
||||
|
||||
class TestPdfReaderInit(unittest.TestCase):
    """Smoke-test the input forms accepted by the PdfReader constructor."""

    def test_fname_binary_filelike(self):
        # A binary file object can be handed to PdfReader directly.
        with open(static_pdfs.pdffiles[0][0], 'rb') as handle:
            PdfReader(handle)

    def test_fdata_binary(self):
        # Raw PDF bytes can be supplied through the fdata keyword.
        with open(static_pdfs.pdffiles[0][0], 'rb') as handle:
            raw = handle.read()
        PdfReader(fdata=raw)
|
||||
|
||||
|
||||
def main():
    """Run the test suite for this module."""
    unittest.main()


if __name__ == '__main__':
    main()
|
|
@ -1,4 +1,9 @@
|
|||
#! /usr/bin/env python
|
||||
# encoding: utf-8
|
||||
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
|
||||
# Copyright (C) 2006-2017 Patrick Maupin, Austin, Texas
|
||||
# 2016 James Laird-Wah, Sydney, Australia
|
||||
# MIT license -- See LICENSE.txt for details
|
||||
|
||||
'''
|
||||
Run from the directory above like so:
|
||||
|
@ -6,30 +11,106 @@ python -m tests.test_pdfstring
|
|||
'''
|
||||
|
||||
|
||||
import pdfrw
|
||||
from pdfrw import PdfString
|
||||
from pdfrw.py23_diffs import convert_store
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
class TestEncoding(unittest.TestCase):
|
||||
class TestBaseEncoding(unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def decode(value):
|
||||
return pdfrw.objects.PdfString(value).decode()
|
||||
def encode(self, value):
|
||||
x = PdfString.encode(value)
|
||||
if isinstance(value, type(u'')):
|
||||
y = PdfString.from_unicode(value)
|
||||
else:
|
||||
y = PdfString.from_bytes(value)
|
||||
self.assertEqual(x, y)
|
||||
return x
|
||||
|
||||
@staticmethod
|
||||
def encode(value):
|
||||
return str(pdfrw.objects.PdfString.encode(value))
|
||||
def decode(self, value):
|
||||
s = PdfString(value)
|
||||
x = s.to_unicode()
|
||||
y = s.decode()
|
||||
self.assertEqual(x, y)
|
||||
return x
|
||||
|
||||
@classmethod
|
||||
def encode_decode(cls, value):
|
||||
return cls.decode(cls.encode(value))
|
||||
def decode_bytes(self, decode_this, expected):
|
||||
""" Decode to bytes"""
|
||||
self.assertEqual(PdfString(decode_this).to_bytes(),
|
||||
convert_store(expected))
|
||||
|
||||
def roundtrip(self, value):
|
||||
self.assertEqual(value, self.encode_decode(value))
|
||||
def roundtrip(self, value, expected=None):
|
||||
result = self.encode(value)
|
||||
self.assertEqual(value, self.decode(result))
|
||||
if expected is not None:
|
||||
self.assertEqual(result, expected)
|
||||
return result
|
||||
|
||||
def test_doubleslash(self):
|
||||
self.roundtrip('\\')
|
||||
self.roundtrip(r'\\')
|
||||
|
||||
def test_unicode_encoding(self):
|
||||
# These chars are in PdfDocEncoding
|
||||
self.assertEqual(self.roundtrip(u'PDF™©®')[0], '(')
|
||||
# These chars are not in PdfDocEncoding
|
||||
self.assertEqual(self.roundtrip(u'δΩσ')[0], '<')
|
||||
# Check that we're doing a reasonable encoding
|
||||
# Might want to change this later if we change the definition of reasonable
|
||||
self.roundtrip(u'(\n\u00FF', '(\\(\n\xff)')
|
||||
self.roundtrip(u'(\n\u0101', '<FEFF0028000A0101>')
|
||||
|
||||
|
||||
def test_constructor(self):
|
||||
obj = PdfString('hello')
|
||||
|
||||
def test_continuation(self):
|
||||
# See PDF 1.7 ref section 3.2 page 55
|
||||
s1 = PdfString('(These two strings are the same.)')
|
||||
self.assertEqual(s1.decode(), s1[1:-1])
|
||||
s2 = PdfString('(These \\\ntwo strings \\\nare the same.)')
|
||||
self.assertEqual(s1.decode(), s2.decode())
|
||||
s2 = PdfString(s2.replace('\n', '\r'))
|
||||
self.assertEqual(s1.decode(), s2.decode())
|
||||
s2 = PdfString(s2.replace('\r', '\r\n'))
|
||||
self.assertEqual(s1.decode(), s2.decode())
|
||||
|
||||
def test_hex_whitespace(self):
|
||||
# See PDF 1.7 ref section 3.2 page 56
|
||||
self.assertEqual(self.decode('<41 \n\r\t\f\v42>'), 'AB')
|
||||
|
||||
def test_unicode_escaped_decode(self):
|
||||
# Some PDF producers happily put unicode strings in PdfDocEncoding,
|
||||
# because the Unicode BOM and \0 are valid code points
|
||||
decoded = self.decode('(\xfe\xff\0h\0e\0l\0l\0o)')
|
||||
self.assertEqual(decoded, "hello")
|
||||
|
||||
|
||||
def test_unescaping(self):
|
||||
self.decode_bytes(r'( \( \) \\ \n \t \f \r \r\n \\n)',
|
||||
' ( ) \\ \n \t \f \r \r\n \\n')
|
||||
|
||||
self.decode_bytes(r'(\b\010\10)', '\b\b\b')
|
||||
self.decode_bytes('(\\n\n\\r\r\\t\t\\b\b\\f\f()\\1\\23\\0143)',
|
||||
'\n\n\r\r\t\t\b\b\f\f()\001\023\f3')
|
||||
self.decode_bytes(r'(\\\nabc)', '\\\nabc')
|
||||
self.decode_bytes(r'(\ )', ' ')
|
||||
|
||||
def test_BOM_variants(self):
|
||||
self.roundtrip(u'\ufeff', '<FEFFFEFF>')
|
||||
self.roundtrip(u'\ufffe', '<FEFFFFFE>')
|
||||
self.roundtrip(u'\xfe\xff', '<FEFF00FE00FF>')
|
||||
self.roundtrip(u'\xff\xfe', '(\xff\xfe)')
|
||||
self.assertRaises(UnicodeError, PdfString.from_unicode,
|
||||
u'þÿ blah', text_encoding='pdfdocencoding')
|
||||
|
||||
def test_byte_encode(self):
|
||||
self.assertEqual(self.encode(b'ABC'), '(ABC)')
|
||||
|
||||
def test_nullstring(self):
|
||||
self.assertEqual(PdfString('<>').to_bytes(), b'')
|
||||
self.assertEqual(PdfString('()').to_bytes(), b'')
|
||||
|
||||
def main():
|
||||
unittest.main()
|
||||
|
|
|
@ -79,11 +79,12 @@ class TestOnePdf(unittest.TestCase):
|
|||
result = 'skip -- encrypt'
|
||||
hash = '------skip-encrypt-no-file------'
|
||||
return self.skipTest('File encrypted')
|
||||
writer = pdfrw.PdfWriter(compress=compress)
|
||||
writer = pdfrw.PdfWriter(dstf, compress=compress)
|
||||
if repaginate:
|
||||
writer.addpages(trailer.pages)
|
||||
trailer = None
|
||||
writer.write(dstf, trailer)
|
||||
else:
|
||||
writer.trailer = trailer
|
||||
writer.write()
|
||||
with open(dstf, 'rb') as f:
|
||||
data = f.read()
|
||||
size = len(data)
|
||||
|
@ -112,15 +113,20 @@ def build_tests():
|
|||
def test(self):
|
||||
self.roundtrip(*args, **kw)
|
||||
return test
|
||||
for mytest, repaginate in (
|
||||
('simple', False),
|
||||
('repaginate', True)
|
||||
for mytest, repaginate, decompress, compress in (
|
||||
('simple', False, False, False),
|
||||
('repaginate', True, False, False),
|
||||
('decompress', False, True, False),
|
||||
('compress', False, True, True),
|
||||
):
|
||||
for srcf in static_pdfs.pdffiles[0]:
|
||||
basename = os.path.basename(srcf)
|
||||
test_name = 'test_%s_%s' % (mytest, basename)
|
||||
test = test_closure(mytest, basename, srcf,
|
||||
repaginate=repaginate)
|
||||
repaginate=repaginate,
|
||||
decompress=decompress,
|
||||
compress=compress,
|
||||
)
|
||||
setattr(TestOnePdf, test_name, test)
|
||||
build_tests()
|
||||
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
#! /usr/bin/env python2
|
||||
"""
|
||||
Put old (good) results in ramdisk/reference,
|
||||
then generate new (unknown) test results in ramdisk/tmp_results,
|
||||
THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally:
|
||||
|
||||
run this to update any checksums in expected.txt where both versions
|
||||
parse to same PDFs.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject
|
||||
|
||||
|
||||
def make_canonical(trailer):
    '''Mark every object reachable from the trailer as indirect.

    Iteratively walks the object graph (no recursion, so arbitrarily
    deep documents are fine), visiting each container exactly once.
    Assumes everything is a Pdf object already.  Returns the trailer.
    '''
    seen = set()
    pending = list(trailer.values())
    while pending:
        node = pending.pop()
        key = id(node)
        if key in seen:
            continue
        seen.add(key)
        node.indirect = True
        # Only containers contribute further work items.
        if isinstance(node, PdfArray):
            pending.extend(node)
        elif isinstance(node, PdfDict):
            pending.extend(node.values())
    return trailer
|
||||
|
||||
# Load the table of known-good checksums as bytes, so digests can be
# matched and substituted verbatim.
with open('expected.txt', 'rb') as handle:
    expected = handle.read()
|
||||
|
||||
def get_digest(fname):
    """Return the MD5 hex digest of *fname*'s contents.

    Returns None (implicitly) when the file is empty, so callers can
    treat an empty output file as "no result".
    """
    with open(fname, 'rb') as handle:
        contents = handle.read()
    if contents:
        return hashlib.md5(contents).hexdigest()
|
||||
|
||||
tmp = '_temp.pdf'
|
||||
count = 0
|
||||
goodcount = 0
|
||||
|
||||
changes = []
|
||||
for (srcpath, _, filenames) in os.walk('ramdisk/reference'):
|
||||
for name in filenames:
|
||||
if not name.endswith('.pdf'):
|
||||
continue
|
||||
src = os.path.join(srcpath, name)
|
||||
dst = src.replace('/reference/', '/tmp_results/')
|
||||
if not os.path.exists(dst):
|
||||
continue
|
||||
src_digest = get_digest(src)
|
||||
if not src_digest or src_digest not in expected:
|
||||
continue
|
||||
print src
|
||||
count += 1
|
||||
trailer = make_canonical(PdfReader(src))
|
||||
out = PdfWriter(tmp)
|
||||
out.write(trailer=trailer)
|
||||
match_digest = get_digest(tmp)
|
||||
if not match_digest:
|
||||
continue
|
||||
trailer = make_canonical(PdfReader(dst))
|
||||
out = PdfWriter(tmp)
|
||||
out.write(trailer=trailer)
|
||||
if get_digest(tmp) != match_digest:
|
||||
continue
|
||||
goodcount += 1
|
||||
print "OK"
|
||||
changes.append((src_digest, get_digest(dst)))
|
||||
|
||||
print count, goodcount
|
||||
|
||||
for stuff in changes:
|
||||
expected = expected.replace(*stuff)
|
||||
|
||||
with open('expected.txt', 'wb') as f:
|
||||
f.write(expected)
|
Loading…
Reference in New Issue