Add addJS method to PdfFileWriter

This commit is contained in:
Vitor Figueiro 2014-07-02 15:50:22 +01:00
parent 6e448ae1c9
commit f71e4979fd
2 changed files with 106 additions and 86 deletions

View File

@ -106,7 +106,8 @@ class PdfFileWriter(object):
NameObject("/Type"): NameObject("/Catalog"),
NameObject("/Pages"): self._pages,
})
self._root = self._addObject(root)
self._root = None
self.root = root
def _addObject(self, obj):
self._objects.append(obj)
@ -209,6 +210,17 @@ class PdfFileWriter(object):
self.insertPage(page, index)
return page
def addJS(self, javascript):
    """
    Add JavaScript that is run when this PDF is opened.

    Builds a ``/JavaScript`` action dictionary, stores it as a new object
    and registers it as the catalog's ``/OpenAction``. Any ``/OpenAction``
    already present on the catalog is replaced.

    :param str javascript: The script source, unescaped. Backslashes and
        parentheses are escaped by this method, e.g.
        ``this.print({bUI:true,bSilent:false,bShrinkToFit:true});``
    """
    # The script is embedded as a PDF literal string, i.e. text wrapped in
    # parentheses. Inside such a string a backslash starts an escape
    # sequence and an unbalanced parenthesis terminates the string early,
    # so both have to be escaped. The backslash is handled first so the
    # backslashes added for the parentheses are not escaped a second time.
    escaped = javascript.replace("\\", "\\\\")
    escaped = escaped.replace("(", "\\(").replace(")", "\\)")

    js = DictionaryObject()
    js.update({
        NameObject("/Type"): NameObject("/Action"),
        NameObject("/S"): NameObject("/JavaScript"),
        # NOTE(review): NameObject is kept from the original code; it
        # presumably writes its text verbatim -- confirm, and consider
        # switching to a proper string object.
        NameObject("/JS"): NameObject("(%s)" % escaped)
    })
    self.root.update({
        NameObject("/OpenAction"): self._addObject(js)
    })
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
"""
Encrypt this PDF file with the PDF Standard encryption handler.
@ -268,6 +280,9 @@ class PdfFileWriter(object):
debug = False
import struct
if(not self._root):
self._root = self._addObject(self.root)
externalReferenceMap = {}
# PDF objects sometimes have circular references to their /Page objects
@ -333,7 +348,7 @@ class PdfFileWriter(object):
if hasattr(self, "_encrypt"):
trailer[NameObject("/Encrypt")] = self._encrypt
trailer.writeToStream(stream, None)
# eof
stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
@ -399,13 +414,13 @@ class PdfFileWriter(object):
return newobj
else:
return data
def getReference(self, obj):
    """
    Return an indirect reference to an object already held by this writer.

    :param obj: an object previously stored in ``self._objects``.
    :return: an ``IndirectObject`` with generation 0 whose id number is
        the object's 1-based position in ``self._objects``.
    :raises ValueError: presumably, when ``obj`` is not stored (assuming
        ``self._objects`` is a plain list).
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the new reference must resolve back to the same object.
    assert reference.getObject() == obj
    return reference
def getOutlineRoot(self):
root = self.getObject(self._root)
@ -413,15 +428,15 @@ class PdfFileWriter(object):
outline = root['/Outlines']
idnum = self._objects.index(outline) + 1
outlineRef = IndirectObject(idnum, 0, self)
assert outlineRef.getObject() == outline
assert outlineRef.getObject() == outline
else:
outline = TreeObject()
outline = TreeObject()
outline.update({ })
outlineRef = self._addObject(outline)
root[NameObject('/Outlines')] = outlineRef
return outline
def getNamedDestRoot(self):
root = self.getObject(self._root)
@ -429,12 +444,12 @@ class PdfFileWriter(object):
names = root['/Names']
idnum = self._objects.index(names) + 1
namesRef = IndirectObject(idnum, 0, self)
assert namesRef.getObject() == names
assert namesRef.getObject() == names
if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject):
dests = names['/Dests']
idnum = self._objects.index(dests) + 1
destsRef = IndirectObject(idnum, 0, self)
assert destsRef.getObject() == dests
assert destsRef.getObject() == dests
if '/Names' in dests:
nd = dests['/Names']
else:
@ -446,7 +461,7 @@ class PdfFileWriter(object):
names[NameObject('/Dests')] = destsRef
nd = ArrayObject()
dests[NameObject('/Names')] = nd
else:
names = DictionaryObject()
namesRef = self._addObject(names)
@ -456,49 +471,49 @@ class PdfFileWriter(object):
names[NameObject('/Dests')] = destsRef
nd = ArrayObject()
dests[NameObject('/Names')] = nd
return nd
def addBookmarkDestination(self, dest, parent=None):
    """
    Store a destination object and attach it as a child in the outline tree.

    :param dest: the destination object to add to this writer.
    :param parent: reference to the outline entry that should own the new
        child; when omitted, the outline root is used.
    :return: the reference returned by ``_addObject`` for ``dest``.
    """
    destRef = self._addObject(dest)

    # The outline root is fetched unconditionally, even when a parent is
    # supplied, so any side effect of getOutlineRoot() still happens.
    outline_root = self.getOutlineRoot()
    owner = outline_root if parent == None else parent

    owner = owner.getObject()
    owner.addChild(destRef, self)
    return destRef
def addBookmarkDict(self, bookmark, parent=None):
    """
    Add a bookmark described by a dictionary to the outline tree.

    The entries of ``bookmark`` are copied into a new ``TreeObject``. If
    the dictionary has an ``/A`` entry, that entry is copied into its own
    ``DictionaryObject``, stored as a separate object and referenced from
    the bookmark.

    :param bookmark: mapping of bookmark keys to values.
    :param parent: reference to the outline entry that should own the new
        bookmark; when omitted, the outline root is used.
    :return: the reference returned by ``_addObject`` for the new bookmark.
    """
    node = TreeObject()
    for key, value in list(bookmark.items()):
        node[NameObject(str(key))] = value
    node.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for key, value in list(bookmark['/A'].items()):
            action[NameObject(str(key))] = value
        node[NameObject('/A')] = self._addObject(action)

    bookmarkRef = self._addObject(node)

    # The outline root is fetched unconditionally, even when a parent is
    # supplied, so any side effect of getOutlineRoot() still happens.
    outline_root = self.getOutlineRoot()
    owner = outline_root if parent == None else parent

    owner = owner.getObject()
    owner.addChild(bookmarkRef, self)
    return bookmarkRef
def addBookmark(self, title, pagenum, parent=None):
"""
Add a bookmark to this PDF file.
@ -517,10 +532,10 @@ class PdfFileWriter(object):
actionRef = self._addObject(action)
outlineRef = self.getOutlineRoot()
if parent == None:
parent = outlineRef
bookmark = TreeObject()
@ -530,10 +545,10 @@ class PdfFileWriter(object):
})
bookmarkRef = self._addObject(bookmark)
parent = parent.getObject()
parent.addChild(bookmarkRef, self)
return bookmarkRef
def addNamedDestinationObject(self, dest):
@ -541,8 +556,8 @@ class PdfFileWriter(object):
nd = self.getNamedDestRoot()
nd.extend([dest['/Title'], destRef])
return destRef
return destRef
def addNamedDestination(self, title, pagenum):
pageRef = self.getObject(self._pages)['/Kids'][pagenum]
@ -551,12 +566,12 @@ class PdfFileWriter(object):
NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
NameObject('/S') : NameObject('/GoTo')
})
destRef = self._addObject(dest)
nd = self.getNamedDestRoot()
nd.extend([title, destRef])
return destRef
def removeLinks(self):
@ -714,7 +729,7 @@ class PdfFileWriter(object):
borderArr.append(dashPattern)
else:
borderArr = [NumberObject(0)] * 3
if isinstance(rect, Str):
rect = NameObject(rect)
elif isinstance(rect, RectangleObject):
@ -739,12 +754,12 @@ class PdfFileWriter(object):
pageRef[NameObject('/Annots')] = ArrayObject([lnkRef])
_valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
def getPageLayout(self):
"""
Get the page layout.
See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.
:return: Page layout currently being used.
:rtype: str, None if not specified
"""
@ -752,13 +767,13 @@ class PdfFileWriter(object):
return self.getObject(self._root)['/PageLayout']
except KeyError:
return None
def setPageLayout(self, layout):
"""
Set the page layout
:param str layout: The page layout to be used
Valid layouts are:
/NoLayout Layout explicitly not specified
/SinglePage Show one page at a time
@ -774,7 +789,7 @@ class PdfFileWriter(object):
layout = NameObject(layout)
root = self.getObject(self._root)
root.update({NameObject('/PageLayout'): layout})
pageLayout = property(getPageLayout, setPageLayout)
"""Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
@ -786,7 +801,7 @@ class PdfFileWriter(object):
Get the page mode.
See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
of valid modes.
:return: Page mode currently being used.
:rtype: str, None if not specified
"""
@ -800,7 +815,7 @@ class PdfFileWriter(object):
Set the page mode.
:param str mode: The page mode to use.
Valid modes are:
/UseNone Do not show outlines or thumbnails panels
/UseOutlines Show outlines (aka bookmarks) panel
@ -815,7 +830,7 @@ class PdfFileWriter(object):
mode = NameObject(mode)
root = self.getObject(self._root)
root.update({NameObject('/PageMode'): mode})
pageMode = property(getPageMode, setPageMode)
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
@ -915,8 +930,8 @@ class PdfFileReader(object):
:raises PdfReadError: if file is encrypted and restrictions prevent
this action.
"""
# Flattened pages will not work on an Encrypted PDF;
# Flattened pages will not work on an Encrypted PDF;
# the PDF file's page count is used in this case. Otherwise,
# the original method (flattened page count) is used.
if self.isEncrypted:
@ -971,7 +986,7 @@ class PdfFileReader(object):
if retval == None:
retval = {}
catalog = self.trailer["/Root"]
# get the name tree
if "/Dests" in catalog:
tree = catalog["/Dests"]
@ -979,7 +994,7 @@ class PdfFileReader(object):
names = catalog['/Names']
if "/Dests" in names:
tree = names['/Dests']
if tree == None:
return retval
@ -1016,17 +1031,17 @@ class PdfFileReader(object):
if outlines == None:
outlines = []
catalog = self.trailer["/Root"]
# get the outline dictionary and named destinations
if "/Outlines" in catalog:
lines = catalog["/Outlines"]
if "/First" in lines:
node = lines["/First"]
self._namedDests = self.getNamedDestinations()
if node == None:
return outlines
# see if there are any more outlines
while True:
outline = self._buildOutline(node)
@ -1050,10 +1065,10 @@ class PdfFileReader(object):
page, typ = array[0:2]
array = array[2:]
return Destination(title, page, typ, *array)
def _buildOutline(self, node):
dest, title, outline = None, None, None
if "/A" in node and "/Title" in node:
# Action, section 8.5 (only type GoTo supported)
title = node["/Title"]
@ -1097,7 +1112,7 @@ class PdfFileReader(object):
return self.trailer['/Root']['/PageLayout']
except KeyError:
return None
pageLayout = property(getPageLayout)
"""Read-only property accessing the
:meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
@ -1107,7 +1122,7 @@ class PdfFileReader(object):
Get the page mode.
See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
for a description of valid modes.
:return: Page mode currently being used.
:rtype: ``str``, ``None`` if not specified
"""
@ -1197,20 +1212,20 @@ class PdfFileReader(object):
warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
(i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)
if self.strict:
if self.strict:
raise utils.PdfReadError("Can't read object stream: %s"%e)
# Replace with null. Hopefully it's nothing important.
obj = NullObject()
return obj
if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
return NullObject()
def getObject(self, indirectReference):
debug = False
if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
retval = self.cacheGetIndirectObject(indirectReference.generation,
retval = self.cacheGetIndirectObject(indirectReference.generation,
indirectReference.idnum)
if retval != None:
return retval
@ -1225,11 +1240,11 @@ class PdfFileReader(object):
idnum, generation = self.readObjectHeader(self.stream)
if idnum != indirectReference.idnum and self.xrefIndex:
# Xref table probably had bad indexes due to not being zero-indexed
if self.strict:
if self.strict:
raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
% (indirectReference.idnum, indirectReference.generation, idnum, generation))
else: pass # xref table is corrected in non-strict mode
elif idnum != indirectReference.idnum:
elif idnum != indirectReference.idnum:
# some other problem
raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
% (indirectReference.idnum, indirectReference.generation, idnum, generation))
@ -1253,9 +1268,9 @@ class PdfFileReader(object):
else:
warnings.warn("Object %d %d not defined."%(indirectReference.idnum,
indirectReference.generation), utils.PdfReadWarning)
#if self.strict:
#if self.strict:
raise utils.PdfReadError("Could not find object.")
self.cacheIndirectObject(indirectReference.generation,
self.cacheIndirectObject(indirectReference.generation,
indirectReference.idnum, retval)
return retval
@ -1286,7 +1301,7 @@ class PdfFileReader(object):
obj = stream.read(3)
readNonWhitespace(stream)
stream.seek(-1, 1)
if (extra and self.strict):
if (extra and self.strict):
#not a fatal error
warnings.warn("Superfluous whitespace found in object header %s %s" % \
(idnum, generation), utils.PdfReadWarning)
@ -1298,7 +1313,7 @@ class PdfFileReader(object):
if debug and out: print(("cache hit: %d %d"%(idnum, generation)))
elif debug: print(("cache miss: %d %d"%(idnum, generation)))
return out
def cacheIndirectObject(self, generation, idnum, obj):
# return None # Sometimes we want to turn off cache for debugging.
if (generation, idnum) in self.resolvedObjects:
@ -1371,17 +1386,17 @@ class PdfFileReader(object):
cnt = 0
while cnt < size:
line = stream.read(20)
# It's very clear in section 3.4.3 of the PDF spec
# that all cross-reference table lines are a fixed
# 20 bytes (as of PDF 1.7). However, some files have
# 21-byte entries (or more) due to the use of \r\n
# (CRLF) EOL's. Detect that case, and adjust the line
# (CRLF) EOL's. Detect that case, and adjust the line
# until it does not begin with a \r (CR) or \n (LF).
while line[0] in b_("\x0D\x0A"):
stream.seek(-20 + 1, 1)
line = stream.read(20)
# On the other hand, some malformed PDF files
# use a single character EOL without a preceding
# space. Detect that case, and seek the stream
@ -1390,7 +1405,7 @@ class PdfFileReader(object):
# text "trailer"):
if line[-1] in b_("0123456789t"):
stream.seek(-1, 1)
offset, generation = line[:16].split(b_(" "))
offset, generation = int(offset), int(generation)
if generation not in self.xref:
@ -1431,7 +1446,7 @@ class PdfFileReader(object):
assert xrefstream["/Type"] == "/XRef"
self.cacheIndirectObject(generation, idnum, xrefstream)
streamData = BytesIO(b_(xrefstream.getData()))
# Index pairs specify the subsections in the dictionary. If
# Index pairs specify the subsections in the dictionary. If
# none create one subsection that spans everything.
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs))))
@ -1445,17 +1460,17 @@ class PdfFileReader(object):
if entrySizes[i] > 0:
d = streamData.read(entrySizes[i])
return convertToInt(d, entrySizes[i])
# PDF Spec Table 17: A value of zero for an element in the
# PDF Spec Table 17: A value of zero for an element in the
# W array indicates...the default value shall be used
if i == 0: return 1 # First value defaults to 1
else: return 0
def used_before(num, generation):
# We move backwards through the xrefs, don't replace any.
return num in self.xref.get(generation, []) or \
num in self.xref_objStm
# Iterate through each subsection
last_end = 0
for start, size in self._pairs(idx_pairs):
@ -1492,7 +1507,7 @@ class PdfFileReader(object):
elif self.strict:
raise utils.PdfReadError("Unknown xref type: %s"%
xref_type)
trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
for key in trailerKeys:
if key in xrefstream and key not in self.trailer:
@ -1542,10 +1557,10 @@ class PdfFileReader(object):
#if not, then either it's just plain wrong, or the non-zero-index is actually correct
stream.seek(loc, 0) #return to where it was
def _zeroXref(self, generation):
    """
    Shift every key of one generation's xref table down by ``self.xrefIndex``.

    :param generation: key into ``self.xref`` selecting the table to rebuild.
    """
    shifted = {}
    # Iterate over a snapshot of the entries, as the original did.
    for objnum, entry in list(self.xref[generation].items()):
        shifted[objnum - self.xrefIndex] = entry
    self.xref[generation] = shifted
def _pairs(self, array):
i = 0
while True:
@ -1810,7 +1825,7 @@ class PageObject(DictionaryObject):
def _pushPopGS(contents, pdf):
# adds a graphics state "push" and "pop" to the beginning and end
# of a content stream. This isolates it from changes such as
# of a content stream. This isolates it from changes such as
# transformation matrices.
stream = ContentStream(contents, pdf)
stream.operations.insert(0, [[], "q"])
@ -1892,12 +1907,12 @@ class PageObject(DictionaryObject):
page2Content, rename, self.pdf)
page2Content = PageObject._pushPopGS(page2Content, self.pdf)
newContentArray.append(page2Content)
# if expanding the page to fit a new page, calculate the new media box size
if expand:
corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
@ -2554,24 +2569,24 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# described in Algorithm 3.2.
key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
# 2. Initialize the MD5 hash function and pass the 32-byte padding string
# shown in step 1 of Algorithm 3.2 as input to this function.
# shown in step 1 of Algorithm 3.2 as input to this function.
m = md5()
m.update(_encryption_padding)
# 3. Pass the first element of the file's file identifier array (the value
# of the ID entry in the document's trailer dictionary; see Table 3.13 on
# page 73) to the hash function and finish the hash. (See implementation
# note 25 in Appendix H.)
# note 25 in Appendix H.)
m.update(id1_entry.original_bytes)
md5_hash = m.digest()
# 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
# function with the encryption key from step 1.
# function with the encryption key from step 1.
val = utils.RC4_encrypt(key, md5_hash)
# 5. Do the following 19 times: Take the output from the previous
# invocation of the RC4 function and pass it as input to a new invocation
# of the function; use an encryption key generated by taking each byte of
# the original encryption key (obtained in step 2) and performing an XOR
# operation between that byte and the single-byte value of the iteration
# counter (from 1 to 19).
# counter (from 1 to 19).
for i in range(1, 20):
new_key = b_('')
for l in range(len(key)):
@ -2579,7 +2594,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
val = utils.RC4_encrypt(new_key, val)
# 6. Append 16 bytes of arbitrary padding to the output from the final
# invocation of the RC4 function and store the 32-byte result as the value
# of the U entry in the encryption dictionary.
# of the U entry in the encryption dictionary.
# (implementer's note: I don't know what "arbitrary padding" is supposed to
# mean, so I have used null bytes. This seems to match a few other
# people's implementations)

View File

@ -2,7 +2,7 @@ from PyPDF2 import PdfFileWriter, PdfFileReader
output = PdfFileWriter()
input1 = PdfFileReader(open("document1.pdf", "rb"))
# print how many pages input1 has:
print "document1.pdf has %d pages." % input1.getNumPages()
@ -21,7 +21,7 @@ page4 = input1.getPage(3)
watermark = PdfFileReader(open("watermark.pdf", "rb"))
page4.mergePage(watermark.getPage(0))
output.addPage(page4)
# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(4)
@ -31,6 +31,11 @@ page5.mediaBox.upperRight = (
)
output.addPage(page5)
# add some JavaScript to launch the print dialog when this PDF is opened.
# the password dialog may prevent the print dialog from being shown;
# if that's the case, comment out the encryption lines below to try this out
output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
# encrypt your new PDF and add a password
password = "secret"
output.encrypt(password)