401 lines
13 KiB
Python
401 lines
13 KiB
Python
# vim: sw=4:expandtab:foldmethod=marker
|
|
#
|
|
# Copyright (c) 2006, Mathieu Fenniak
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met:
|
|
#
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
# * The name of the author may not be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
from generic import *
|
|
from pdf import PdfFileReader, PdfFileWriter, Destination
|
|
|
|
class _MergedPage(object):
|
|
"""
|
|
_MergedPage is used internally by PdfFileMerger to collect necessary information on each page that is being merged.
|
|
"""
|
|
def __init__(self, pagedata, src, id):
|
|
self.src = src
|
|
self.pagedata = pagedata
|
|
self.out_pagedata = None
|
|
self.id = id
|
|
|
|
class PdfFileMerger(object):
|
|
"""
|
|
PdfFileMerger merges multiple PDFs into a single PDF. It can concatenate,
|
|
slice, insert, or any combination of the above.
|
|
|
|
See the functions "merge" (or "append") and "write" (or "overwrite") for
|
|
usage information.
|
|
"""
|
|
|
|
def __init__(self):
    """
    >>> PdfFileMerger()

    Creates an empty merger.  No arguments are needed.
    """
    # Source side: one entry per merged input, and the queued pages in
    # output order
    self.inputs, self.pages = [], []

    # Destination side: the writer that receives the pages on write()
    self.output = PdfFileWriter()

    # Navigation data collected while merging
    self.bookmarks, self.named_dests = [], []

    # Next identifier to hand out to a merged page
    self.id_count = 0
|
|
|
|
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
    """
    >>> merge(position, file, bookmark=None, pages=None, import_bookmarks=True)

    Merges the pages from the source document specified by "file" into the output
    file at the page number specified by "position".

    "file" may be a filename, a file-like object or a PdfFileReader.  When a
    filename is given the file is opened here; it is closed again by close(),
    or immediately if the merge fails part-way.

    Optionally, you may specify a bookmark to be applied at the beginning of the
    included file by supplying the text of the bookmark in the "bookmark" parameter.

    You may prevent the source document's bookmarks from being imported by
    specifying "import_bookmarks" as False.

    You may also use the "pages" parameter to merge only the specified range of
    pages from the source document into the output document.  "pages" is a
    (start, end) tuple that is expanded into range(), so "end" is exclusive.

    Raises TypeError if "pages" is a bare number or string.
    """
    # Reject a malformed "pages" before anything is opened, so a bad argument
    # can never cost an open file handle.
    if isinstance(pages, (int, float, str, unicode)):
        raise TypeError('"pages" must be a tuple of (start, end)')

    my_file = False
    if isinstance(fileobj, (str, unicode)):
        fileobj = open(fileobj, 'rb')
        my_file = True

    try:
        if isinstance(fileobj, PdfFileReader):
            pdfr = fileobj
            # NOTE(review): assumes PdfFileReader exposes its underlying
            # stream as ".file" -- confirm against pdf.py
            fileobj = pdfr.file
        else:
            pdfr = PdfFileReader(fileobj)

        # Find the range of pages to merge
        if pages is None:
            pages = (0, pdfr.getNumPages())

        if bookmark:
            # The new bookmark targets the id the first merged page is
            # about to receive
            bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))

        outline = []
        if import_bookmarks:
            outline = self._trim_outline(pdfr, pdfr.getOutlines(), pages)

        if bookmark:
            # Imported outline entries become children of the new bookmark
            self.bookmarks += [bookmark, outline]
        else:
            self.bookmarks += outline

        self.named_dests += self._trim_dests(pdfr, pdfr.namedDestinations, pages)

        # Gather all the pages that are going to be merged
        srcpages = []
        for i in range(*pages):
            srcpages.append(_MergedPage(pdfr.getPage(i), pdfr, self.id_count))
            self.id_count += 1

        self._associate_dests_to_pages(srcpages)
        self._associate_bookmarks_to_pages(srcpages)
    except Exception:
        # Nothing will ever call close() on a file that never made it into
        # self.inputs, so release it here before propagating the error.
        if my_file:
            fileobj.close()
        raise

    # Slice to insert the pages at the specified position
    self.pages[position:position] = srcpages

    # Keep track of our input files so we can close them later
    self.inputs.append((fileobj, pdfr, my_file))
|
|
|
|
|
|
def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
    """
    >>> append(file, bookmark=None, pages=None, import_bookmarks=True)

    Shorthand for "merge" with the insert position fixed at the current end of
    the page list, so the new pages are concatenated after everything merged so
    far.  All other parameters are passed through to "merge" unchanged.
    """
    end_of_output = len(self.pages)
    self.merge(end_of_output, fileobj, bookmark, pages, import_bookmarks)
|
|
|
|
|
|
def write(self, fileobj):
    """
    >>> write(file)

    Writes all data that has been merged to "file" (which can be a filename or any
    kind of file-like object).

    A file opened here from a filename is always closed before returning, even
    when writing fails.  A file object supplied by the caller is left open.
    """
    my_file = False
    if isinstance(fileobj, (str, unicode)):
        fileobj = open(fileobj, 'wb')
        my_file = True

    try:
        # Add pages to the PdfFileWriter
        for page in self.pages:
            self.output.addPage(page.pagedata)
            # Record a reference to the page just appended (the last kid of
            # the writer's page tree) so destinations and bookmarks can be
            # pointed at it below.
            page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())

        # Once all pages are added, create bookmarks to point at those pages
        self._write_dests()
        self._write_bookmarks()

        # Write the output to the file
        self.output.write(fileobj)
    finally:
        # Only close what was opened here
        if my_file:
            fileobj.close()
|
|
|
|
|
|
|
|
def close(self):
    """
    >>> close()

    Closes the input files that this merger opened itself, then drops the
    queued pages, the input records and the output writer.
    """
    self.pages = []

    # Each input record is (file object, reader, opened-by-us flag); files
    # supplied by the caller are left open.
    for stream, _reader, opened_here in self.inputs:
        if not opened_here:
            continue
        stream.close()

    self.inputs = []
    self.output = None
|
|
|
|
def _trim_dests(self, pdf, dests, pages):
|
|
"""
|
|
Removes any named destinations that are not a part of the specified page set
|
|
"""
|
|
new_dests = []
|
|
prev_header_added = True
|
|
for k, o in dests.items():
|
|
for j in range(*pages):
|
|
if pdf.getPage(j).getObject() == o['/Page'].getObject():
|
|
o[NameObject('/Page')] = o['/Page'].getObject()
|
|
assert str(k) == str(o['/Title'])
|
|
new_dests.append(o)
|
|
break
|
|
return new_dests
|
|
|
|
def _trim_outline(self, pdf, outline, pages):
|
|
"""
|
|
Removes any outline/bookmark entries that are not a part of the specified page set
|
|
"""
|
|
new_outline = []
|
|
prev_header_added = True
|
|
for i, o in enumerate(outline):
|
|
if type(o) == list:
|
|
sub = self._trim_outline(pdf, o, pages)
|
|
if sub:
|
|
if not prev_header_added:
|
|
new_outline.append(outline[i-1])
|
|
new_outline.append(sub)
|
|
else:
|
|
prev_header_added = False
|
|
for j in range(*pages):
|
|
if pdf.getPage(j).getObject() == o['/Page'].getObject():
|
|
o[NameObject('/Page')] = o['/Page'].getObject()
|
|
new_outline.append(o)
|
|
prev_header_added = True
|
|
break
|
|
return new_outline
|
|
|
|
def _write_dests(self):
|
|
dests = self.named_dests
|
|
|
|
for v in dests:
|
|
pageno = None
|
|
pdf = None
|
|
if v.has_key('/Page'):
|
|
for i, p in enumerate(self.pages):
|
|
if p.id == v['/Page']:
|
|
v[NameObject('/Page')] = p.out_pagedata
|
|
pageno = i
|
|
pdf = p.src
|
|
if pageno != None:
|
|
self.output.addNamedDestinationObject(v)
|
|
|
|
def _write_bookmarks(self, bookmarks=None, parent=None):
|
|
|
|
if bookmarks == None:
|
|
bookmarks = self.bookmarks
|
|
|
|
|
|
last_added = None
|
|
for b in bookmarks:
|
|
if type(b) == list:
|
|
self._write_bookmarks(b, last_added)
|
|
continue
|
|
|
|
pageno = None
|
|
pdf = None
|
|
if b.has_key('/Page'):
|
|
for i, p in enumerate(self.pages):
|
|
if p.id == b['/Page']:
|
|
b[NameObject('/Page')] = p.out_pagedata
|
|
pageno = i
|
|
pdf = p.src
|
|
if pageno != None:
|
|
last_added = self.output.addBookmarkDestination(b, parent)
|
|
|
|
|
|
def _associate_dests_to_pages(self, pages):
|
|
for nd in self.named_dests:
|
|
pageno = None
|
|
np = nd['/Page']
|
|
|
|
if type(np) == NumberObject:
|
|
continue
|
|
|
|
for p in pages:
|
|
if np.getObject() == p.pagedata.getObject():
|
|
pageno = p.id
|
|
|
|
if pageno != None:
|
|
nd[NameObject('/Page')] = NumberObject(pageno)
|
|
else:
|
|
raise ValueError, "Unresolved named destination '%s'" % (nd['/Title'],)
|
|
|
|
def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
|
|
if bookmarks == None:
|
|
bookmarks = self.bookmarks
|
|
|
|
for b in bookmarks:
|
|
if type(b) == list:
|
|
self._associate_bookmarks_to_pages(pages, b)
|
|
continue
|
|
|
|
pageno = None
|
|
bp = b['/Page']
|
|
|
|
if type(bp) == NumberObject:
|
|
continue
|
|
|
|
for p in pages:
|
|
if bp.getObject() == p.pagedata.getObject():
|
|
pageno = p.id
|
|
|
|
if pageno != None:
|
|
b[NameObject('/Page')] = NumberObject(pageno)
|
|
else:
|
|
raise ValueError, "Unresolved bookmark '%s'" % (b['/Title'],)
|
|
|
|
def findBookmark(self, bookmark, root=None):
    """
    Locates a bookmark in the bookmark tree.

    "bookmark" is either a bookmark object or a title; "root" is the list to
    search and defaults to the merger's full bookmark list.

    Returns the position as a list of indices, one per nesting level (for
    example [1, 0] is the first entry of the nested list at index 1), or None
    if nothing matches.
    """
    if root is None:
        root = self.bookmarks

    for i, b in enumerate(root):
        if isinstance(b, list):
            res = self.findBookmark(bookmark, b)
            if res:
                return [i] + res
            # A nested list has no '/Title' of its own; move on to the next
            # sibling instead of indexing the list with a string.
            continue
        if b == bookmark or b['/Title'] == bookmark:
            return [i]

    return None
|
|
|
|
def addBookmark(self, title, pagenum, parent=None):
    """
    Add a bookmark to the pdf, using the specified title and pointing at
    the specified page number.  A parent can be specified to make this a
    nested bookmark below the parent.

    "parent" may be a bookmark object or title (looked up with findBookmark)
    or an index path in the form findBookmark returns.  With no parent the
    bookmark is appended at the top level.

    Raises ValueError if "parent" is given but cannot be found.
    """
    if parent is None:
        iloc = None
    elif isinstance(parent, list):
        iloc = parent
    else:
        iloc = self.findBookmark(parent)
        if iloc is None:
            raise ValueError("Parent bookmark %r not found" % (parent,))

    # NOTE(review): 826 is passed as the argument after '/FitH'; it looks
    # like a fixed top coordinate -- confirm the intended value.
    dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))

    if parent is None:
        self.bookmarks.append(dest)
        return

    # Walk down to the list that directly contains the parent
    bmparent = self.bookmarks
    for i in iloc[:-1]:
        bmparent = bmparent[i]

    # Children live in a list placed right after their parent: reuse it if
    # it exists, otherwise create it.
    npos = iloc[-1] + 1
    if npos < len(bmparent) and isinstance(bmparent[npos], list):
        bmparent[npos].append(dest)
    else:
        bmparent.insert(npos, [dest])
|
|
|
|
|
|
def addNamedDestination(self, title, pagenum):
    """
    Queue a named destination called "title" that points at "pagenum".

    The destination is only appended to the merger's list of named
    destinations here.
    """
    dest_title = TextStringObject(title)
    dest_page = NumberObject(pagenum)
    # Remaining arguments: the '/FitH' name followed by the number 826
    fit_args = (NameObject('/FitH'), NumberObject(826))
    self.named_dests.append(Destination(dest_title, dest_page, *fit_args))
|
|
|
|
|
|
class OutlinesObject(list):
    """
    A list of outline entries that is paired with an outline tree object.

    "pdf" is the document the outline belongs to, "tree" the tree object whose
    children are added and removed alongside the list, and "parent" an
    optional parent that is only stored.
    """
    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Removes the entry at position "index" from both the list and the tree.

        Note that, unlike list.remove, this takes an index and not a value.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, page):
        """
        Creates a bookmark called "title" that jumps to page number "page" and
        adds it to the tree.

        "page" is used as an index into the '/Kids' array of the document's
        page tree.  The new bookmark is registered with the document and added
        as a child of the tree; it is not appended to the list itself.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][page]

        # GoTo action pointing at the page
        action = DictionaryObject()
        action.update({
            NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S') : NameObject('/GoTo')
        })
        actionRef = self.pdf._addObject(action)

        bookmark = TreeObject()
        bookmark.update({
            NameObject('/A') : actionRef,
            NameObject('/Title') : createStringObject(title),
        })

        self.pdf._addObject(bookmark)

        self.tree.addChild(bookmark)

    def removeAll(self):
        """
        Removes every child from the tree, dropping one list entry per child
        for as long as the list still has entries.
        """
        # Iterate over a snapshot: the tree is modified while we go
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
            # add() does not append to the list, so the list can be shorter
            # than the tree; never pop from an empty list.
            if self:
                self.pop()