debian-pypdf4/PyPDF2/merger.py

401 lines
13 KiB
Python

# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from generic import *
from pdf import PdfFileReader, PdfFileWriter, Destination
class _MergedPage(object):
"""
_MergedPage is used internally by PdfFileMerger to collect necessary information on each page that is being merged.
"""
def __init__(self, pagedata, src, id):
self.src = src
self.pagedata = pagedata
self.out_pagedata = None
self.id = id
class PdfFileMerger(object):
"""
PdfFileMerger merges multiple PDFs into a single PDF. It can concatenate,
slice, insert, or any combination of the above.
See the functions "merge" (or "append") and "write" (or "overwrite") for
usage information.
"""
def __init__(self):
"""
>>> PdfFileMerger()
Initializes a PdfFileMerger, no parameters required
"""
self.inputs = []
self.pages = []
self.output = PdfFileWriter()
self.bookmarks = []
self.named_dests = []
self.id_count = 0
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
"""
>>> merge(position, file, bookmark=None, pages=None, import_bookmarks=True)
Merges the pages from the source document specified by "file" into the output
file at the page number specified by "position".
Optionally, you may specify a bookmark to be applied at the beginning of the
included file by supplying the text of the bookmark in the "bookmark" parameter.
You may prevent the source document's bookmarks from being imported by
specifying "import_bookmarks" as False.
You may also use the "pages" parameter to merge only the specified range of
pages from the source document into the output document.
"""
my_file = False
if type(fileobj) in (str, unicode):
fileobj = file(fileobj, 'rb')
my_file = True
if type(fileobj) == PdfFileReader:
pdfr = fileobj
fileobj = pdfr.file
else:
pdfr = PdfFileReader(fileobj)
# Find the range of pages to merge
if pages == None:
pages = (0, pdfr.getNumPages())
elif type(pages) in (int, float, str, unicode):
raise TypeError('"pages" must be a tuple of (start, end)')
srcpages = []
if bookmark:
bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))
outline = []
if import_bookmarks:
outline = pdfr.getOutlines()
outline = self._trim_outline(pdfr, outline, pages)
if bookmark:
self.bookmarks += [bookmark, outline]
else:
self.bookmarks += outline
dests = pdfr.namedDestinations
dests = self._trim_dests(pdfr, dests, pages)
self.named_dests += dests
# Gather all the pages that are going to be merged
for i in range(*pages):
pg = pdfr.getPage(i)
id = self.id_count
self.id_count += 1
mp = _MergedPage(pg, pdfr, id)
srcpages.append(mp)
self._associate_dests_to_pages(srcpages)
self._associate_bookmarks_to_pages(srcpages)
# Slice to insert the pages at the specified position
self.pages[position:position] = srcpages
# Keep track of our input files so we can close them later
self.inputs.append((fileobj, pdfr, my_file))
def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
"""
>>> append(file, bookmark=None, pages=None, import_bookmarks=True):
Identical to the "merge" function, but assumes you want to concatenate all pages
onto the end of the file instead of specifying a position.
"""
self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
def write(self, fileobj):
"""
>>> write(file)
Writes all data that has been merged to "file" (which can be a filename or any
kind of file-like object)
"""
my_file = False
if type(fileobj) in (str, unicode):
fileobj = file(fileobj, 'wb')
my_file = True
# Add pages to the PdfFileWriter
for page in self.pages:
self.output.addPage(page.pagedata)
page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())
# Once all pages are added, create bookmarks to point at those pages
self._write_dests()
self._write_bookmarks()
# Write the output to the file
self.output.write(fileobj)
if my_file:
fileobj.close()
def close(self):
"""
>>> close()
Shuts all file descriptors (input and output) and clears all memory usage
"""
self.pages = []
for fo, pdfr, mine in self.inputs:
if mine:
fo.close()
self.inputs = []
self.output = None
def _trim_dests(self, pdf, dests, pages):
"""
Removes any named destinations that are not a part of the specified page set
"""
new_dests = []
prev_header_added = True
for k, o in dests.items():
for j in range(*pages):
if pdf.getPage(j).getObject() == o['/Page'].getObject():
o[NameObject('/Page')] = o['/Page'].getObject()
assert str(k) == str(o['/Title'])
new_dests.append(o)
break
return new_dests
def _trim_outline(self, pdf, outline, pages):
"""
Removes any outline/bookmark entries that are not a part of the specified page set
"""
new_outline = []
prev_header_added = True
for i, o in enumerate(outline):
if type(o) == list:
sub = self._trim_outline(pdf, o, pages)
if sub:
if not prev_header_added:
new_outline.append(outline[i-1])
new_outline.append(sub)
else:
prev_header_added = False
for j in range(*pages):
if pdf.getPage(j).getObject() == o['/Page'].getObject():
o[NameObject('/Page')] = o['/Page'].getObject()
new_outline.append(o)
prev_header_added = True
break
return new_outline
def _write_dests(self):
dests = self.named_dests
for v in dests:
pageno = None
pdf = None
if v.has_key('/Page'):
for i, p in enumerate(self.pages):
if p.id == v['/Page']:
v[NameObject('/Page')] = p.out_pagedata
pageno = i
pdf = p.src
if pageno != None:
self.output.addNamedDestinationObject(v)
def _write_bookmarks(self, bookmarks=None, parent=None):
if bookmarks == None:
bookmarks = self.bookmarks
last_added = None
for b in bookmarks:
if type(b) == list:
self._write_bookmarks(b, last_added)
continue
pageno = None
pdf = None
if b.has_key('/Page'):
for i, p in enumerate(self.pages):
if p.id == b['/Page']:
b[NameObject('/Page')] = p.out_pagedata
pageno = i
pdf = p.src
if pageno != None:
last_added = self.output.addBookmarkDestination(b, parent)
def _associate_dests_to_pages(self, pages):
for nd in self.named_dests:
pageno = None
np = nd['/Page']
if type(np) == NumberObject:
continue
for p in pages:
if np.getObject() == p.pagedata.getObject():
pageno = p.id
if pageno != None:
nd[NameObject('/Page')] = NumberObject(pageno)
else:
raise ValueError, "Unresolved named destination '%s'" % (nd['/Title'],)
def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
if bookmarks == None:
bookmarks = self.bookmarks
for b in bookmarks:
if type(b) == list:
self._associate_bookmarks_to_pages(pages, b)
continue
pageno = None
bp = b['/Page']
if type(bp) == NumberObject:
continue
for p in pages:
if bp.getObject() == p.pagedata.getObject():
pageno = p.id
if pageno != None:
b[NameObject('/Page')] = NumberObject(pageno)
else:
raise ValueError, "Unresolved bookmark '%s'" % (b['/Title'],)
def findBookmark(self, bookmark, root=None):
if root == None:
root = self.bookmarks
for i, b in enumerate(root):
if type(b) == list:
res = self.findBookmark(bookmark, b)
if res:
return [i] + res
if b == bookmark or b['/Title'] == bookmark:
return [i]
return None
def addBookmark(self, title, pagenum, parent=None):
"""
Add a bookmark to the pdf, using the specified title and pointing at
the specified page number. A parent can be specified to make this a
nested bookmark below the parent.
"""
if parent == None:
iloc = [len(self.bookmarks)-1]
elif type(parent) == list:
iloc = parent
else:
iloc = self.findBookmark(parent)
dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
if parent == None:
self.bookmarks.append(dest)
else:
bmparent = self.bookmarks
for i in iloc[:-1]:
bmparent = bmparent[i]
npos = iloc[-1]+1
if npos < len(bmparent) and type(bmparent[npos]) == list:
bmparent[npos].append(dest)
else:
bmparent.insert(npos, [dest])
def addNamedDestination(self, title, pagenum):
"""
Add a destination to the pdf, using the specified title and pointing
at the specified page number.
"""
dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
self.named_dests.append(dest)
class OutlinesObject(list):
def __init__(self, pdf, tree, parent=None):
list.__init__(self)
self.tree = tree
self.pdf = pdf
self.parent = parent
def remove(self, index):
obj = self[index]
del self[index]
self.tree.removeChild(obj)
def add(self, title, page):
pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
action = DictionaryObject()
action.update({
NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
NameObject('/S') : NameObject('/GoTo')
})
actionRef = self.pdf._addObject(action)
bookmark = TreeObject()
bookmark.update({
NameObject('/A') : actionRef,
NameObject('/Title') : createStringObject(title),
})
pdf._addObject(bookmark)
self.tree.addChild(bookmark)
def removeAll(self):
for child in [x for x in self.tree.children()]:
self.tree.removeChild(child)
self.pop()