commit c59a212a4cda512c184a987312970b464404d17b Author: Adam Coleman Date: Fri Dec 30 09:04:56 2011 -0600 First commit Original PyPDF code. Updates should be coming from Noah soon. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c9b568f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +*.swp diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..d426897 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,205 @@ +Version 1.12, 2008-09-02 +------------------------ + + - Added support for XMP metadata. + + - Fix reading files with xref streams with multiple /Index values. + + - Fix extracting content streams that use graphics operators longer than 2 + characters. Affects merging PDF files. + + +Version 1.11, 2008-05-09 +------------------------ + + - Patch from Hartmut Goebel to permit RectangleObjects to accept NumberObject + or FloatObject values. + + - PDF compatibility fixes. + + - Fix to read object xref stream in correct order. + + - Fix for comments inside content streams. + + +Version 1.10, 2007-10-04 +------------------------ + + - Text strings from PDF files are returned as Unicode string objects when + pyPdf determines that they can be decoded (as UTF-16 strings, or as + PDFDocEncoding strings). Unicode objects are also written out when + necessary. This means that string objects in pyPdf can be either + generic.ByteStringObject instances, or generic.TextStringObject instances. + + - The extractText method now returns a unicode string object. + + - All document information properties now return unicode string objects. In + the event that a document provides docinfo properties that are not decoded by + pyPdf, the raw byte strings can be accessed with an "_raw" property (ie. + title_raw rather than title) + + - generic.DictionaryObject instances have been enhanced to be easier to use. 
+ Values coming out of dictionary objects will automatically be de-referenced + (.getObject will be called on them), unless accessed by the new "raw_get" + method. DictionaryObjects can now only contain PdfObject instances (as keys + and values), making it easier to debug where non-PdfObject values (which + cannot be written out) are entering dictionaries. + + - Support for reading named destinations and outlines in PDF files. Original + patch by Ashish Kulkarni. + + - Stream compatibility reading enhancements for malformed PDF files. + + - Cross reference table reading enhancements for malformed PDF files. + + - Encryption documentation. + + - Replace some "assert" statements with error raising. + + - Minor optimizations to FlateDecode algorithm increase speed when using PNG + predictors. + +Version 1.9, 2006-12-15 +----------------------- + + - Fix several serious bugs introduced in version 1.8, caused by a failure to + run through our PDF test suite before releasing that version. + + - Fix bug in NullObject reading and writing. + +Version 1.8, 2006-12-14 +----------------------- + + - Add support for decryption with the standard PDF security handler. This + allows for decrypting PDF files given the proper user or owner password. + + - Add support for encryption with the standard PDF security handler. + + - Add new pythondoc documentation. + + - Fix bug in ASCII85 decode that occurs when whitespace exists inside the + two terminating characters of the stream. + +Version 1.7, 2006-12-10 +----------------------- + + - Fix a bug when using a single page object in two PdfFileWriter objects. + + - Adjust PyPDF to be tolerant of whitespace characters that don't belong + during a stream object. + + - Add documentInfo property to PdfFileReader. + + - Add numPages property to PdfFileReader. + + - Add pages property to PdfFileReader. + + - Add extractText function to PdfFileReader. 
+ + +Version 1.6, 2006-06-06 +----------------------- + + - Add basic support for comments in PDF files. This allows us to read some + ReportLab PDFs that could not be read before. + + - Add "auto-repair" for finding xref table at slightly bad locations. + + - New StreamObject backend, cleaner and more powerful. Allows the use of + stream filters more easily, including compressed streams. + + - Add a graphics state push/pop around page merges. Improves quality of + page merges when one page's content stream leaves the graphics + in an abnormal state. + + - Add PageObject.compressContentStreams function, which filters all content + streams and compresses them. This will reduce the size of PDF pages, + especially after they could have been decompressed in a mergePage + operation. + + - Support inline images in PDF content streams. + + - Add support for using .NET framework compression when zlib is not + available. This does not make pyPdf compatible with IronPython, but it + is a first step. + + - Add support for reading the document information dictionary, and extracting + title, author, subject, producer and creator tags. + + - Add patch to support NullObject and multiple xref streams, from Bradley + Lawrence. + + +Version 1.5, 2006-01-28 +----------------------- + +- Fix a bug where merging pages did not work in "no-rename" cases when the + second page has an array of content streams. + +- Remove some debugging output that should not have been present. + + +Version 1.4, 2006-01-27 +----------------------- + +- Add capability to merge pages from multiple PDF files into a single page + using the PageObject.mergePage function. See example code (README or web + site) for more information. + +- Add ability to modify a page's MediaBox, CropBox, BleedBox, TrimBox, and + ArtBox properties through PageObject. See example code (README or web site) + for more information. 
+ +- Refactor pdf.py into multiple files: generic.py (contains objects like + NameObject, DictionaryObject), filters.py (contains filter code), + utils.py (various). This does not affect importing PdfFileReader + or PdfFileWriter. + +- Add new decoding functions for standard PDF filters ASCIIHexDecode and + ASCII85Decode. + +- Change url and download_url to refer to new pybrary.net web site. + + +Version 1.3, 2006-01-23 +----------------------- + +- Fix new bug introduced in 1.2 where PDF files with \r line endings did not + work properly anymore. A new test suite developed with various PDF files + should prevent regression bugs from now on. + +- Fix a bug where inheriting attributes from page nodes did not work. + + +Version 1.2, 2006-01-23 +----------------------- + +- Improved support for files with CRLF-based line endings, fixing a common + reported problem stating "assertion error: assert line == "%%EOF"". + +- Software author/maintainer is now officially a proud married person, which + is sure to result in better software... somehow. + + +Version 1.1, 2006-01-18 +----------------------- + +- Add capability to rotate pages. + +- Improved PDF reading support to properly manage inherited attributes from + /Type=/Pages nodes. This means that page groups that are rotated or have + different media boxes or whatever will now work properly. + +- Added PDF 1.5 support. Namely cross-reference streams and object streams. + This release can mangle Adobe's PDFReference16.pdf successfully. + + +Version 1.0, 2006-01-17 +----------------------- + +- First distutils-capable true public release. Supports a wide variety of PDF + files that I found sitting around on my system. + +- Does not support some PDF 1.5 features, such as object streams, + cross-reference streams. 
+ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e058995 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2006-2008, Mathieu Fenniak +Some contributions copyright (c) 2007, Ashish Kulkarni + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. +* The name of the author may not be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f7aec42 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include CHANGELOG diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py new file mode 100644 index 0000000..f4a6100 --- /dev/null +++ b/PyPDF2/__init__.py @@ -0,0 +1,4 @@ +from pdf import PdfFileReader, PdfFileWriter +from merger import PdfFileMerger + +__all__ = ["pdf", "PdfFileMerger"] diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py new file mode 100644 index 0000000..01e39d1 --- /dev/null +++ b/PyPDF2/filters.py @@ -0,0 +1,252 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Implementation of stream filters for PDF. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +from utils import PdfReadError +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +try: + import zlib + def decompress(data): + return zlib.decompress(data) + def compress(data): + return zlib.compress(data) +except ImportError: + # Unable to import zlib. Attempt to use the System.IO.Compression + # library from the .NET framework. 
(IronPython only) + import System + from System import IO, Collections, Array + def _string_to_bytearr(buf): + retval = Array.CreateInstance(System.Byte, len(buf)) + for i in range(len(buf)): + retval[i] = ord(buf[i]) + return retval + def _bytearr_to_string(bytes): + retval = "" + for i in range(bytes.Length): + retval += chr(bytes[i]) + return retval + def _read_bytes(stream): + ms = IO.MemoryStream() + buf = Array.CreateInstance(System.Byte, 2048) + while True: + bytes = stream.Read(buf, 0, buf.Length) + if bytes == 0: + break + else: + ms.Write(buf, 0, bytes) + retval = ms.ToArray() + ms.Close() + return retval + def decompress(data): + bytes = _string_to_bytearr(data) + ms = IO.MemoryStream() + ms.Write(bytes, 0, bytes.Length) + ms.Position = 0 # fseek 0 + gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress) + bytes = _read_bytes(gz) + retval = _bytearr_to_string(bytes) + gz.Close() + return retval + def compress(data): + bytes = _string_to_bytearr(data) + ms = IO.MemoryStream() + gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True) + gz.Write(bytes, 0, bytes.Length) + gz.Close() + ms.Position = 0 # fseek 0 + bytes = ms.ToArray() + retval = _bytearr_to_string(bytes) + ms.Close() + return retval + + +class FlateDecode(object): + def decode(data, decodeParms): + data = decompress(data) + predictor = 1 + if decodeParms: + predictor = decodeParms.get("/Predictor", 1) + # predictor 1 == no predictor + if predictor != 1: + columns = decodeParms["/Columns"] + # PNG prediction: + if predictor >= 10 and predictor <= 15: + output = StringIO() + # PNG prediction can vary from row to row + rowlength = columns + 1 + assert len(data) % rowlength == 0 + prev_rowdata = (0,) * rowlength + for row in xrange(len(data) / rowlength): + rowdata = [ord(x) for x in data[(row*rowlength):((row+1)*rowlength)]] + filterByte = rowdata[0] + if filterByte == 0: + pass + elif filterByte == 1: + for i in range(2, rowlength): + 
rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256 + elif filterByte == 2: + for i in range(1, rowlength): + rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256 + else: + # unsupported PNG filter + raise PdfReadError("Unsupported PNG filter %r" % filterByte) + prev_rowdata = rowdata + output.write(''.join([chr(x) for x in rowdata[1:]])) + data = output.getvalue() + else: + # unsupported predictor + raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) + return data + decode = staticmethod(decode) + + def encode(data): + return compress(data) + encode = staticmethod(encode) + +class ASCIIHexDecode(object): + def decode(data, decodeParms=None): + retval = "" + char = "" + x = 0 + while True: + c = data[x] + if c == ">": + break + elif c.isspace(): + x += 1 + continue + char += c + if len(char) == 2: + retval += chr(int(char, base=16)) + char = "" + x += 1 + assert char == "" + return retval + decode = staticmethod(decode) + +class ASCII85Decode(object): + def decode(data, decodeParms=None): + retval = "" + group = [] + x = 0 + hitEod = False + # remove all whitespace from data + data = [y for y in data if not (y in ' \n\r\t')] + while not hitEod: + c = data[x] + if len(retval) == 0 and c == "<" and data[x+1] == "~": + x += 2 + continue + #elif c.isspace(): + # x += 1 + # continue + elif c == 'z': + assert len(group) == 0 + retval += '\x00\x00\x00\x00' + continue + elif c == "~" and data[x+1] == ">": + if len(group) != 0: + # cannot have a final group of just 1 char + assert len(group) > 1 + cnt = len(group) - 1 + group += [ 85, 85, 85 ] + hitEod = cnt + else: + break + else: + c = ord(c) - 33 + assert c >= 0 and c < 85 + group += [ c ] + if len(group) >= 5: + b = group[0] * (85**4) + \ + group[1] * (85**3) + \ + group[2] * (85**2) + \ + group[3] * 85 + \ + group[4] + assert b < (2**32 - 1) + c4 = chr((b >> 0) % 256) + c3 = chr((b >> 8) % 256) + c2 = chr((b >> 16) % 256) + c1 = chr(b >> 24) + retval += (c1 + c2 + c3 + c4) + if hitEod: + retval = 
retval[:-4+hitEod] + group = [] + x += 1 + return retval + decode = staticmethod(decode) + +def decodeStreamData(stream): + from generic import NameObject + filters = stream.get("/Filter", ()) + if len(filters) and not isinstance(filters[0], NameObject): + # we have a single filter instance + filters = (filters,) + data = stream._data + for filterType in filters: + if filterType == "/FlateDecode": + data = FlateDecode.decode(data, stream.get("/DecodeParms")) + elif filterType == "/ASCIIHexDecode": + data = ASCIIHexDecode.decode(data) + elif filterType == "/ASCII85Decode": + data = ASCII85Decode.decode(data) + elif filterType == "/Crypt": + decodeParams = stream.get("/DecodeParams", {}) + if "/Name" not in decodeParams and "/Type" not in decodeParams: + pass + else: + raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet") + else: + # unsupported filter + raise NotImplementedError("unsupported filter %s" % filterType) + return data + +if __name__ == "__main__": + assert "abc" == ASCIIHexDecode.decode('61\n626\n3>') + + ascii85Test = """ + <~9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKFCj@.4Gp$d7F!,L7@<6@)/0JDEF@3BB/F*&OCAfu2/AKY + i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF-FD5W8ARlolDIa + l(DIduD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~> + """ + ascii85_originalText="Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure." + assert ASCII85Decode.decode(ascii85Test) == ascii85_originalText + diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py new file mode 100644 index 0000000..efc6486 --- /dev/null +++ b/PyPDF2/generic.py @@ -0,0 +1,1047 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +""" +Implementation of generic PDF objects (dictionary, number, string, and so on) +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import re +from utils import readNonWhitespace, RC4_encrypt +import filters +import utils +import decimal +import codecs + +def readObject(stream, pdf): + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + if tok == 't' or tok == 'f': + # boolean object + return BooleanObject.readFromStream(stream) + elif tok == '(': + # string object + return readStringFromStream(stream) + elif tok == '/': + # name object + return NameObject.readFromStream(stream) + elif tok == '[': + # array object + return ArrayObject.readFromStream(stream, pdf) + elif tok == 'n': + # null object + return NullObject.readFromStream(stream) + elif tok == '<': + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == '<<': + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif tok == '%': + # comment + while tok not in ('\r', '\n'): + tok = stream.read(1) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + if tok == '+' or tok == '-': + # number + return NumberObject.readFromStream(stream) + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if re.match(r"(\d+)\s(\d+)\sR[^a-zA-Z]", peek) != None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + +class PdfObject(object): + sweep_required = False + + def getObject(self): + """Resolves indirect references.""" + return self + + +class NullObject(PdfObject): + def writeToStream(self, stream, encryption_key): + stream.write("null") + + def readFromStream(stream): + nulltxt = stream.read(4) + if nulltxt != "null": + raise utils.PdfReadError, "error reading null object" + return NullObject() + 
readFromStream = staticmethod(readFromStream) + + +class BooleanObject(PdfObject): + def __init__(self, value): + self.value = value + + def writeToStream(self, stream, encryption_key): + if self.value: + stream.write("true") + else: + stream.write("false") + + def readFromStream(stream): + word = stream.read(4) + if word == "true": + return BooleanObject(True) + elif word == "fals": + stream.read(1) + return BooleanObject(False) + assert False + readFromStream = staticmethod(readFromStream) + + +class ArrayObject(list, PdfObject): + sweep_required = True + + def writeToStream(self, stream, encryption_key): + stream.write("[") + for data in self: + stream.write(" ") + data.writeToStream(stream, encryption_key) + stream.write(" ]") + + def readFromStream(stream, pdf): + arr = ArrayObject() + tmp = stream.read(1) + if tmp != "[": + raise utils.PdfReadError, "error reading array" + while True: + # skip leading whitespace + tok = stream.read(1) + while tok.isspace(): + tok = stream.read(1) + stream.seek(-1, 1) + # check for array ending + peekahead = stream.read(1) + if peekahead == "]": + break + stream.seek(-1, 1) + # read and append obj + arr.append(readObject(stream, pdf)) + return arr + readFromStream = staticmethod(readFromStream) + + +class IndirectObject(PdfObject): + sweep_required = True + + def __init__(self, idnum, generation, pdf): + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def getObject(self): + return self.pdf.getObject(self).getObject() + + def __repr__(self): + return "IndirectObject(%r, %r)" % (self.idnum, self.generation) + + def __eq__(self, other): + return ( + other != None and + isinstance(other, IndirectObject) and + self.idnum == other.idnum and + self.generation == other.generation and + self.pdf is other.pdf + ) + + def __ne__(self, other): + return not self.__eq__(other) + + def writeToStream(self, stream, encryption_key): + stream.write("%s %s R" % (self.idnum, self.generation)) + + def readFromStream(stream, 
pdf): + idnum = "" + while True: + tok = stream.read(1) + if tok.isspace(): + break + idnum += tok + generation = "" + while True: + tok = stream.read(1) + if tok.isspace(): + break + generation += tok + r = stream.read(1) + if r != "R": + raise utils.PdfReadError("error reading indirect object reference") + return IndirectObject(int(idnum), int(generation), pdf) + readFromStream = staticmethod(readFromStream) + + +class FloatObject(decimal.Decimal, PdfObject): + def __new__(cls, value="0", context=None): + return decimal.Decimal.__new__(cls, str(value), context) + def __repr__(self): + if self == self.to_integral(): + return str(self.quantize(decimal.Decimal(1))) + else: + # XXX: this adds useless extraneous zeros. + return "%.5f" % self + def writeToStream(self, stream, encryption_key): + stream.write(repr(self)) + + +class NumberObject(int, PdfObject): + def __init__(self, value): + int.__init__(value) + + def writeToStream(self, stream, encryption_key): + stream.write(repr(self)) + + def readFromStream(stream): + name = "" + while True: + tok = stream.read(1) + if tok != '+' and tok != '-' and tok != '.' and not tok.isdigit(): + stream.seek(-1, 1) + break + name += tok + if name.find(".") != -1: + return FloatObject(name) + else: + return NumberObject(name) + readFromStream = staticmethod(readFromStream) + + +## +# Given a string (either a "str" or "unicode"), create a ByteStringObject or a +# TextStringObject to represent the string. +def createStringObject(string): + if isinstance(string, unicode): + return TextStringObject(string) + elif isinstance(string, str): + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. 
Some strings are strings, some are just byte arrays. + try: + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def readHexStringFromStream(stream): + stream.read(1) + txt = "" + x = "" + while True: + tok = readNonWhitespace(stream) + if tok == ">": + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = "" + if len(x) == 1: + x += "0" + if len(x) == 2: + txt += chr(int(x, base=16)) + return createStringObject(txt) + + +def readStringFromStream(stream): + tok = stream.read(1) + parens = 1 + txt = "" + while True: + tok = stream.read(1) + if tok == "(": + parens += 1 + elif tok == ")": + parens -= 1 + if parens == 0: + break + elif tok == "\\": + tok = stream.read(1) + if tok == "n": + tok = "\n" + elif tok == "r": + tok = "\r" + elif tok == "t": + tok = "\t" + elif tok == "b": + tok = "\b" + elif tok == "f": + tok = "\f" + elif tok == "(": + tok = "(" + elif tok == ")": + tok = ")" + elif tok == "\\": + tok = "\\" + elif tok.isdigit(): + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." (PDF reference 7.3.4.2, p 16) + for i in range(2): + ntok = stream.read(1) + if ntok.isdigit(): + tok += ntok + else: + break + tok = chr(int(tok, base=8)) + elif tok in "\n\r": + # This case is hit when a backslash followed by a line + # break occurs. 
If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in "\n\r": + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = '' + else: + raise utils.PdfReadError("Unexpected escaped string") + txt += tok + return createStringObject(txt) + + +## +# Represents a string object where the text encoding could not be determined. +# This occurs quite often, as the PDF spec doesn't provide an alternate way to +# represent strings -- for example, the encryption data stored in files (like +# /O) is clearly not text, but is still stored in a "String" object. +class ByteStringObject(str, PdfObject): + + ## + # For compatibility with TextStringObject.original_bytes. This method + # returns self. + original_bytes = property(lambda self: self) + + def writeToStream(self, stream, encryption_key): + bytearr = self + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + stream.write("<") + stream.write(bytearr.encode("hex")) + stream.write(">") + + +## +# Represents a string object that has been decoded into a real unicode string. +# If read from a PDF document, this string appeared to match the +# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to +# occur. +class TextStringObject(unicode, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. 
Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... + try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write("(") + for c in bytearr: + if not c.isalnum() and c != ' ': + stream.write("\\%03o" % ord(c)) + else: + stream.write(c) + stream.write(")") + + +class NameObject(str, PdfObject): + delimiterCharacters = "(", ")", "<", ">", "[", "]", "{", "}", "/", "%" + + def __init__(self, data): + str.__init__(data) + + def writeToStream(self, stream, encryption_key): + stream.write(self) + + def readFromStream(stream): + name = stream.read(1) + if name != "/": + raise utils.PdfReadError, "name read error" + while True: + tok = stream.read(1) + if tok.isspace() or tok in NameObject.delimiterCharacters: + stream.seek(-1, 1) + break + name += tok + return NameObject(name) + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + sweep_required = True + + def __init__(self, *args, **kwargs): + if len(args) == 0: + self.update(kwargs) + elif len(args) == 1: + arr = args[0] + # If we're passed a list/tuple, make a dict out of it + if not hasattr(arr, "iteritems"): + newarr = {} + for k, v in arr: + newarr[k] = v + arr = newarr + self.update(arr) + else: + raise TypeError("dict expected at most 1 argument, 
got 3") + + def update(self, arr): + # note, a ValueError halfway through copying values + # will leave half the values in this dict. + for k, v in arr.iteritems(): + self.__setitem__(k, v) + + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + + def writeToStream(self, stream, encryption_key): + stream.write("<<\n") + for key, value in self.items(): + key.writeToStream(stream, encryption_key) + stream.write(" ") + value.writeToStream(stream, encryption_key) + stream.write("\n") + stream.write(">>") + + def readFromStream(stream, pdf): + tmp = stream.read(2) + if tmp != "<<": + raise utils.PdfReadError, "dictionary read error" + data = {} + while True: + tok = readNonWhitespace(stream) + if tok == ">": + stream.read(1) + break + stream.seek(-1, 1) + key = readObject(stream, pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, pdf) + if data.has_key(key): + # multiple definitions of key not permitted + raise utils.PdfReadError, "multiple definitions in dictionary" + data[key] = value + pos = stream.tell() + s = readNonWhitespace(stream) + if s == 's' and stream.read(5) == 'tream': + eol = stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. + # patch provided by Danial Sandler + while eol == ' ': + eol = stream.read(1) + assert eol in ("\n", "\r") + if eol == "\r": + # read \n after + stream.read(1) + # this is a stream object, not a dictionary + assert data.has_key("/Length") + length = data["/Length"] + if isinstance(length, IndirectObject): + t = stream.tell() + length = pdf.getObject(length) + stream.seek(t, 0) + data["__streamdata__"] = stream.read(length) + e = readNonWhitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != "endstream": + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. 
+ pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == "endstream": + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + else: + stream.seek(pos, 0) + raise utils.PdfReadError, "Unable to find 'endstream' marker after stream." + else: + stream.seek(pos, 0) + if data.has_key("__streamdata__"): + return StreamObject.initializeFromDictionary(data) + else: + retval = DictionaryObject() + retval.update(data) + return retval + readFromStream = staticmethod(readFromStream) + +class TreeObject(DictionaryObject): + def __init__(self): + DictionaryObject.__init__(self) + + def hasChildren(self): + return self.has_key('/First') + + def __iter__(self): + return self.children() + + def children(self): + if not self.hasChildren(): + raise StopIteration + + child = self['/First'] + while True: + yield child + if child == self['/Last']: + raise StopIteration + child = child['/Next'] + + def addChild(self, child, pdf): + childObj = child.getObject() + child = pdf.getReference(childObj) + assert isinstance(child, IndirectObject) + + if not self.has_key('/First'): + self[NameObject('/First')] = child + self[NameObject('/Count')] = NumberObject(0) + prev = None + else: + prev = self['/Last'] + + self[NameObject('/Last')] = child + self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1) + + if prev: + prevRef = pdf.getReference(prev) + assert isinstance(prevRef, IndirectObject) + childObj[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = child + + parentRef = pdf.getReference(self) + assert isinstance(parentRef, IndirectObject) + childObj[NameObject('/Parent')] = parentRef + + def removeChild(self, child): + childObj = child.getObject() + + if not childObj.has_key(NameObject('/Parent')): + raise ValueError, "Removed child does not appear to be a tree item" + elif childObj[NameObject('/Parent')] != self: + raise ValueError, "Removed child is not a member of this tree" + + 
found = False + prevRef = None + prev = None + curRef = self[NameObject('/First')] + cur = curRef.getObject() + lastRef = self[NameObject('/Last')] + last = lastRef.getObject() + while cur != None: + if cur == childObj: + if prev == None: + if cur.has_key(NameObject('/Next')): + # Removing first tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + del next[NameObject('/Prev')] + self[NameObject('/First')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + + else: + # Removing only tree node + assert self[NameObject('/Count')] == 1 + del self[NameObject('/Count')] + del self[NameObject('/First')] + if self.has_key(NameObject('/Last')): + del self[NameObject('/Last')] + else: + if cur.has_key(NameObject('/Next')): + # Removing middle tree node + nextRef = cur[NameObject('/Next')] + next = nextRef.getObject() + next[NameObject('/Prev')] = prevRef + prev[NameObject('/Next')] = nextRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + else: + # Removing last tree node + assert cur == last + del prev[NameObject('/Next')] + self[NameObject('/Last')] = prevRef + self[NameObject('/Count')] = self[NameObject('/Count')] - 1 + found = True + break + + + prevRef = curRef + prev = cur + if cur.has_key(NameObject('/Next')): + curRef = cur[NameObject('/Next')] + cur = curRef.getObject() + else: + curRef = None + cur = None + + if not found: + raise ValueError, "Removal couldn't find item in tree" + + del childObj[NameObject('/Parent')] + if childObj.has_key(NameObject('/Next')): + del childObj[NameObject('/Next')] + if childObj.has_key(NameObject('/Prev')): + del childObj[NameObject('/Prev')] + + def emptyTree(self): + for child in self: + childObj = child.getObject() + del childObj[NameObject('/Parent')] + if childObj.has_key(NameObject('/Next')): + del childObj[NameObject('/Next')] + if childObj.has_key(NameObject('/Prev')): + del childObj[NameObject('/Prev')] + + if self.has_key(NameObject('/Count')): + del 
self[NameObject('/Count')] + if self.has_key(NameObject('/First')): + del self[NameObject('/First')] + if self.has_key(NameObject('/Last')): + del self[NameObject('/Last')] + + +class StreamObject(DictionaryObject): + def __init__(self): + self._data = None + self.decodedSelf = None + + def writeToStream(self, stream, encryption_key): + self[NameObject("/Length")] = NumberObject(len(self._data)) + DictionaryObject.writeToStream(self, stream, encryption_key) + del self["/Length"] + stream.write("\nstream\n") + data = self._data + if encryption_key: + data = RC4_encrypt(encryption_key, data) + stream.write(data) + stream.write("\nendstream") + + def initializeFromDictionary(data): + if data.has_key("/Filter"): + retval = EncodedStreamObject() + else: + retval = DecodedStreamObject() + retval._data = data["__streamdata__"] + del data["__streamdata__"] + del data["/Length"] + retval.update(data) + return retval + initializeFromDictionary = staticmethod(initializeFromDictionary) + + def flateEncode(self): + if self.has_key("/Filter"): + f = self["/Filter"] + if isinstance(f, ArrayObject): + f.insert(0, NameObject("/FlateDecode")) + else: + newf = ArrayObject() + newf.append(NameObject("/FlateDecode")) + newf.append(f) + f = newf + else: + f = NameObject("/FlateDecode") + retval = EncodedStreamObject() + retval[NameObject("/Filter")] = f + retval._data = filters.FlateDecode.encode(self._data) + return retval + + +class DecodedStreamObject(StreamObject): + def getData(self): + return self._data + + def setData(self, data): + self._data = data + + +class EncodedStreamObject(StreamObject): + def __init__(self): + self.decodedSelf = None + + def getData(self): + if self.decodedSelf: + # cached version of decoded object + return self.decodedSelf.getData() + else: + # create decoded object + decoded = DecodedStreamObject() + decoded._data = filters.decodeStreamData(self) + for key, value in self.items(): + if not key in ("/Length", "/Filter", "/DecodeParms"): + decoded[key] = 
value + self.decodedSelf = decoded + return decoded._data + + def setData(self, data): + raise utils.PdfReadError, "Creating EncodedStreamObject is not currently supported" + + +class RectangleObject(ArrayObject): + def __init__(self, arr): + # must have four points + assert len(arr) == 4 + # automatically convert arr[x] into NumberObject(arr[x]) if necessary + ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) + + def ensureIsNumber(self, value): + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) + return value + + def __repr__(self): + return "RectangleObject(%s)" % repr(list(self)) + + def getLowerLeft_x(self): + return self[0] + + def getLowerLeft_y(self): + return self[1] + + def getUpperRight_x(self): + return self[2] + + def getUpperRight_y(self): + return self[3] + + def getUpperLeft_x(self): + return self.getLowerLeft_x() + + def getUpperLeft_y(self): + return self.getUpperRight_y() + + def getLowerRight_x(self): + return self.getUpperRight_x() + + def getLowerRight_y(self): + return self.getLowerLeft_y() + + def getLowerLeft(self): + return self.getLowerLeft_x(), self.getLowerLeft_y() + + def getLowerRight(self): + return self.getLowerRight_x(), self.getLowerRight_y() + + def getUpperLeft(self): + return self.getUpperLeft_x(), self.getUpperLeft_y() + + def getUpperRight(self): + return self.getUpperRight_x(), self.getUpperRight_y() + + def setLowerLeft(self, value): + self[0], self[1] = [self.ensureIsNumber(x) for x in value] + + def setLowerRight(self, value): + self[2], self[1] = [self.ensureIsNumber(x) for x in value] + + def setUpperLeft(self, value): + self[0], self[3] = [self.ensureIsNumber(x) for x in value] + + def setUpperRight(self, value): + self[2], self[3] = [self.ensureIsNumber(x) for x in value] + + def getWidth(self): + return self.getUpperRight_x() - self.getLowerLeft_x() + + def getHeight(self): + return self.getUpperRight_y() - self.getLowerLeft_x() + + lowerLeft = property(getLowerLeft, 
setLowerLeft, None, None) + lowerRight = property(getLowerRight, setLowerRight, None, None) + upperLeft = property(getUpperLeft, setUpperLeft, None, None) + upperRight = property(getUpperRight, setUpperRight, None, None) + + + +## +# A class representing a destination within a PDF file. +# See section 8.2.1 of the PDF 1.6 reference. +# Stability: Added in v1.10, will exist for all v1.x releases. +class Destination(TreeObject): + def __init__(self, title, page, typ, *args): + DictionaryObject.__init__(self) + self[NameObject("/Title")] = title + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.6 reference. + if typ == "/XYZ": + (self[NameObject("/Left")], self[NameObject("/Top")], + self[NameObject("/Zoom")]) = args + elif typ == "/FitR": + (self[NameObject("/Left")], self[NameObject("/Bottom")], + self[NameObject("/Right")], self[NameObject("/Top")]) = args + elif typ in ["/FitH", "FitBH"]: + self[NameObject("/Top")], = args + elif typ in ["/FitV", "FitBV"]: + self[NameObject("/Left")], = args + elif typ in ["/Fit", "FitB"]: + pass + else: + raise utils.PdfReadError("Unknown Destination Type: %r" % typ) + + def getDestArray(self): + return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left','/Bottom','/Right','/Top','/Zoom'] if self.has_key(x)]) + + def writeToStream(self, stream, encryption_key): + stream.write("<<\n") + + key = NameObject('/D') + key.writeToStream(stream, encryption_key) + stream.write(" ") + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + + key = NameObject("/S") + key.writeToStream(stream, encryption_key) + stream.write(" ") + value = NameObject("/GoTo") + value.writeToStream(stream, encryption_key) + + stream.write("\n") + stream.write(">>") + + ## + # Read-only property accessing the destination title. + # @return A string. + title = property(lambda self: self.get("/Title")) + + ## + # Read-only property accessing the destination page. 
+ # @return An integer. + page = property(lambda self: self.get("/Page")) + + ## + # Read-only property accessing the destination type. + # @return A string. + typ = property(lambda self: self.get("/Type")) + + ## + # Read-only property accessing the zoom factor. + # @return A number, or None if not available. + zoom = property(lambda self: self.get("/Zoom", None)) + + ## + # Read-only property accessing the left horizontal coordinate. + # @return A number, or None if not available. + left = property(lambda self: self.get("/Left", None)) + + ## + # Read-only property accessing the right horizontal coordinate. + # @return A number, or None if not available. + right = property(lambda self: self.get("/Right", None)) + + ## + # Read-only property accessing the top vertical coordinate. + # @return A number, or None if not available. + top = property(lambda self: self.get("/Top", None)) + + ## + # Read-only property accessing the bottom vertical coordinate. + # @return A number, or None if not available. 
+ bottom = property(lambda self: self.get("/Bottom", None)) + + +class Bookmark(Destination): + def writeToStream(self, stream, encryption_key): + stream.write("<<\n") + for key in [NameObject(x) for x in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev'] if self.has_key(x)]: + key.writeToStream(stream, encryption_key) + stream.write(" ") + value = self.raw_get(key) + value.writeToStream(stream, encryption_key) + stream.write("\n") + key = NameObject('/Dest') + key.writeToStream(stream, encryption_key) + stream.write(" ") + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + stream.write("\n") + stream.write(">>") + + + +def encode_pdfdocencoding(unicode_string): + retval = '' + for c in unicode_string: + try: + retval += chr(_pdfDocEncoding_rev[c]) + except KeyError: + raise UnicodeEncodeError("pdfdocencoding", c, -1, -1, + "does not exist in translation table") + return retval + +def decode_pdfdocencoding(byte_array): + retval = u'' + for b in byte_array: + c = _pdfDocEncoding[ord(b)] + if c == u'\u0000': + raise UnicodeDecodeError("pdfdocencoding", b, -1, -1, + "does not exist in translation table") + retval += c + return retval + +_pdfDocEncoding = ( + u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', + u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', + u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', u'\u0000', + u'\u02d8', u'\u02c7', u'\u02c6', u'\u02d9', u'\u02dd', u'\u02db', u'\u02da', u'\u02dc', + u'\u0020', u'\u0021', u'\u0022', u'\u0023', u'\u0024', u'\u0025', u'\u0026', u'\u0027', + u'\u0028', u'\u0029', u'\u002a', u'\u002b', u'\u002c', u'\u002d', u'\u002e', u'\u002f', + u'\u0030', u'\u0031', u'\u0032', u'\u0033', u'\u0034', u'\u0035', u'\u0036', u'\u0037', + u'\u0038', u'\u0039', u'\u003a', u'\u003b', u'\u003c', u'\u003d', u'\u003e', u'\u003f', + u'\u0040', u'\u0041', u'\u0042', u'\u0043', u'\u0044', u'\u0045', 
u'\u0046', u'\u0047', + u'\u0048', u'\u0049', u'\u004a', u'\u004b', u'\u004c', u'\u004d', u'\u004e', u'\u004f', + u'\u0050', u'\u0051', u'\u0052', u'\u0053', u'\u0054', u'\u0055', u'\u0056', u'\u0057', + u'\u0058', u'\u0059', u'\u005a', u'\u005b', u'\u005c', u'\u005d', u'\u005e', u'\u005f', + u'\u0060', u'\u0061', u'\u0062', u'\u0063', u'\u0064', u'\u0065', u'\u0066', u'\u0067', + u'\u0068', u'\u0069', u'\u006a', u'\u006b', u'\u006c', u'\u006d', u'\u006e', u'\u006f', + u'\u0070', u'\u0071', u'\u0072', u'\u0073', u'\u0074', u'\u0075', u'\u0076', u'\u0077', + u'\u0078', u'\u0079', u'\u007a', u'\u007b', u'\u007c', u'\u007d', u'\u007e', u'\u0000', + u'\u2022', u'\u2020', u'\u2021', u'\u2026', u'\u2014', u'\u2013', u'\u0192', u'\u2044', + u'\u2039', u'\u203a', u'\u2212', u'\u2030', u'\u201e', u'\u201c', u'\u201d', u'\u2018', + u'\u2019', u'\u201a', u'\u2122', u'\ufb01', u'\ufb02', u'\u0141', u'\u0152', u'\u0160', + u'\u0178', u'\u017d', u'\u0131', u'\u0142', u'\u0153', u'\u0161', u'\u017e', u'\u0000', + u'\u20ac', u'\u00a1', u'\u00a2', u'\u00a3', u'\u00a4', u'\u00a5', u'\u00a6', u'\u00a7', + u'\u00a8', u'\u00a9', u'\u00aa', u'\u00ab', u'\u00ac', u'\u0000', u'\u00ae', u'\u00af', + u'\u00b0', u'\u00b1', u'\u00b2', u'\u00b3', u'\u00b4', u'\u00b5', u'\u00b6', u'\u00b7', + u'\u00b8', u'\u00b9', u'\u00ba', u'\u00bb', u'\u00bc', u'\u00bd', u'\u00be', u'\u00bf', + u'\u00c0', u'\u00c1', u'\u00c2', u'\u00c3', u'\u00c4', u'\u00c5', u'\u00c6', u'\u00c7', + u'\u00c8', u'\u00c9', u'\u00ca', u'\u00cb', u'\u00cc', u'\u00cd', u'\u00ce', u'\u00cf', + u'\u00d0', u'\u00d1', u'\u00d2', u'\u00d3', u'\u00d4', u'\u00d5', u'\u00d6', u'\u00d7', + u'\u00d8', u'\u00d9', u'\u00da', u'\u00db', u'\u00dc', u'\u00dd', u'\u00de', u'\u00df', + u'\u00e0', u'\u00e1', u'\u00e2', u'\u00e3', u'\u00e4', u'\u00e5', u'\u00e6', u'\u00e7', + u'\u00e8', u'\u00e9', u'\u00ea', u'\u00eb', u'\u00ec', u'\u00ed', u'\u00ee', u'\u00ef', + u'\u00f0', u'\u00f1', u'\u00f2', u'\u00f3', u'\u00f4', u'\u00f5', u'\u00f6', 
u'\u00f7', + u'\u00f8', u'\u00f9', u'\u00fa', u'\u00fb', u'\u00fc', u'\u00fd', u'\u00fe', u'\u00ff' +) + +assert len(_pdfDocEncoding) == 256 + +_pdfDocEncoding_rev = {} +for i in xrange(256): + char = _pdfDocEncoding[i] + if char == u"\u0000": + continue + assert char not in _pdfDocEncoding_rev + _pdfDocEncoding_rev[char] = i + diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py new file mode 100644 index 0000000..383d345 --- /dev/null +++ b/PyPDF2/merger.py @@ -0,0 +1,401 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from generic import * +from pdf import PdfFileReader, PdfFileWriter, Destination + +class _MergedPage(object): + """ + _MergedPage is used internally by PdfFileMerger to collect necessary information on each page that is being merged. + """ + def __init__(self, pagedata, src, id): + self.src = src + self.pagedata = pagedata + self.out_pagedata = None + self.id = id + +class PdfFileMerger(object): + """ + PdfFileMerger merges multiple PDFs into a single PDF. It can concatenate, + slice, insert, or any combination of the above. + + See the functions "merge" (or "append") and "write" (or "overwrite") for + usage information. + """ + + def __init__(self): + """ + >>> PdfFileMerger() + + Initializes a PdfFileMerger, no parameters required + """ + self.inputs = [] + self.pages = [] + self.output = PdfFileWriter() + self.bookmarks = [] + self.named_dests = [] + self.id_count = 0 + + def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True): + """ + >>> merge(position, file, bookmark=None, pages=None, import_bookmarks=True) + + Merges the pages from the source document specified by "file" into the output + file at the page number specified by "position". + + Optionally, you may specify a bookmark to be applied at the beginning of the + included file by supplying the text of the bookmark in the "bookmark" parameter. 
+ + You may prevent the source document's bookmarks from being imported by + specifying "import_bookmarks" as False. + + You may also use the "pages" parameter to merge only the specified range of + pages from the source document into the output document. + """ + + my_file = False + if type(fileobj) in (str, unicode): + fileobj = file(fileobj, 'rb') + my_file = True + + if type(fileobj) == PdfFileReader: + pdfr = fileobj + fileobj = pdfr.file + else: + pdfr = PdfFileReader(fileobj) + + # Find the range of pages to merge + if pages == None: + pages = (0, pdfr.getNumPages()) + elif type(pages) in (int, float, str, unicode): + raise TypeError('"pages" must be a tuple of (start, end)') + + srcpages = [] + + if bookmark: + bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit')) + + outline = [] + if import_bookmarks: + outline = pdfr.getOutlines() + outline = self._trim_outline(pdfr, outline, pages) + + if bookmark: + self.bookmarks += [bookmark, outline] + else: + self.bookmarks += outline + + dests = pdfr.namedDestinations + dests = self._trim_dests(pdfr, dests, pages) + self.named_dests += dests + + # Gather all the pages that are going to be merged + for i in range(*pages): + pg = pdfr.getPage(i) + + id = self.id_count + self.id_count += 1 + + mp = _MergedPage(pg, pdfr, id) + + srcpages.append(mp) + + self._associate_dests_to_pages(srcpages) + self._associate_bookmarks_to_pages(srcpages) + + + # Slice to insert the pages at the specified position + self.pages[position:position] = srcpages + + # Keep track of our input files so we can close them later + self.inputs.append((fileobj, pdfr, my_file)) + + + def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): + """ + >>> append(file, bookmark=None, pages=None, import_bookmarks=True): + + Identical to the "merge" function, but assumes you want to concatenate all pages + onto the end of the file instead of specifying a position. 
+ """ + + self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) + + + def write(self, fileobj): + """ + >>> write(file) + + Writes all data that has been merged to "file" (which can be a filename or any + kind of file-like object) + """ + my_file = False + if type(fileobj) in (str, unicode): + fileobj = file(fileobj, 'wb') + my_file = True + + + # Add pages to the PdfFileWriter + for page in self.pages: + self.output.addPage(page.pagedata) + page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject()) + + + # Once all pages are added, create bookmarks to point at those pages + self._write_dests() + self._write_bookmarks() + + # Write the output to the file + self.output.write(fileobj) + + if my_file: + fileobj.close() + + + + def close(self): + """ + >>> close() + + Shuts all file descriptors (input and output) and clears all memory usage + """ + self.pages = [] + for fo, pdfr, mine in self.inputs: + if mine: + fo.close() + + self.inputs = [] + self.output = None + + def _trim_dests(self, pdf, dests, pages): + """ + Removes any named destinations that are not a part of the specified page set + """ + new_dests = [] + prev_header_added = True + for k, o in dests.items(): + for j in range(*pages): + if pdf.getPage(j).getObject() == o['/Page'].getObject(): + o[NameObject('/Page')] = o['/Page'].getObject() + assert str(k) == str(o['/Title']) + new_dests.append(o) + break + return new_dests + + def _trim_outline(self, pdf, outline, pages): + """ + Removes any outline/bookmark entries that are not a part of the specified page set + """ + new_outline = [] + prev_header_added = True + for i, o in enumerate(outline): + if type(o) == list: + sub = self._trim_outline(pdf, o, pages) + if sub: + if not prev_header_added: + new_outline.append(outline[i-1]) + new_outline.append(sub) + else: + prev_header_added = False + for j in range(*pages): + if pdf.getPage(j).getObject() == o['/Page'].getObject(): + 
o[NameObject('/Page')] = o['/Page'].getObject() + new_outline.append(o) + prev_header_added = True + break + return new_outline + + def _write_dests(self): + dests = self.named_dests + + for v in dests: + pageno = None + pdf = None + if v.has_key('/Page'): + for i, p in enumerate(self.pages): + if p.id == v['/Page']: + v[NameObject('/Page')] = p.out_pagedata + pageno = i + pdf = p.src + if pageno != None: + self.output.addNamedDestinationObject(v) + + def _write_bookmarks(self, bookmarks=None, parent=None): + + if bookmarks == None: + bookmarks = self.bookmarks + + + last_added = None + for b in bookmarks: + if type(b) == list: + self._write_bookmarks(b, last_added) + continue + + pageno = None + pdf = None + if b.has_key('/Page'): + for i, p in enumerate(self.pages): + if p.id == b['/Page']: + b[NameObject('/Page')] = p.out_pagedata + pageno = i + pdf = p.src + if pageno != None: + last_added = self.output.addBookmarkDestination(b, parent) + + + def _associate_dests_to_pages(self, pages): + for nd in self.named_dests: + pageno = None + np = nd['/Page'] + + if type(np) == NumberObject: + continue + + for p in pages: + if np.getObject() == p.pagedata.getObject(): + pageno = p.id + + if pageno != None: + nd[NameObject('/Page')] = NumberObject(pageno) + else: + raise ValueError, "Unresolved named destination '%s'" % (nd['/Title'],) + + def _associate_bookmarks_to_pages(self, pages, bookmarks=None): + if bookmarks == None: + bookmarks = self.bookmarks + + for b in bookmarks: + if type(b) == list: + self._associate_bookmarks_to_pages(pages, b) + continue + + pageno = None + bp = b['/Page'] + + if type(bp) == NumberObject: + continue + + for p in pages: + if bp.getObject() == p.pagedata.getObject(): + pageno = p.id + + if pageno != None: + b[NameObject('/Page')] = NumberObject(pageno) + else: + raise ValueError, "Unresolved bookmark '%s'" % (b['/Title'],) + + def findBookmark(self, bookmark, root=None): + if root == None: + root = self.bookmarks + + for i, b in 
enumerate(root): + if type(b) == list: + res = self.findBookmark(bookmark, b) + if res: + return [i] + res + if b == bookmark or b['/Title'] == bookmark: + return [i] + + return None + + def addBookmark(self, title, pagenum, parent=None): + """ + Add a bookmark to the pdf, using the specified title and pointing at + the specified page number. A parent can be specified to make this a + nested bookmark below the parent. + """ + + if parent == None: + iloc = [len(self.bookmarks)-1] + elif type(parent) == list: + iloc = parent + else: + iloc = self.findBookmark(parent) + + dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) + + if parent == None: + self.bookmarks.append(dest) + else: + bmparent = self.bookmarks + for i in iloc[:-1]: + bmparent = bmparent[i] + npos = iloc[-1]+1 + if npos < len(bmparent) and type(bmparent[npos]) == list: + bmparent[npos].append(dest) + else: + bmparent.insert(npos, [dest]) + + + def addNamedDestination(self, title, pagenum): + """ + Add a destination to the pdf, using the specified title and pointing + at the specified page number. 
+ """ + + dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826)) + self.named_dests.append(dest) + + +class OutlinesObject(list): + def __init__(self, pdf, tree, parent=None): + list.__init__(self) + self.tree = tree + self.pdf = pdf + self.parent = parent + + def remove(self, index): + obj = self[index] + del self[index] + self.tree.removeChild(obj) + + def add(self, title, page): + pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum] + action = DictionaryObject() + action.update({ + NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), + NameObject('/S') : NameObject('/GoTo') + }) + actionRef = self.pdf._addObject(action) + bookmark = TreeObject() + + bookmark.update({ + NameObject('/A') : actionRef, + NameObject('/Title') : createStringObject(title), + }) + + pdf._addObject(bookmark) + + self.tree.addChild(bookmark) + + def removeAll(self): + for child in [x for x in self.tree.children()]: + self.tree.removeChild(child) + self.pop() \ No newline at end of file diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py new file mode 100644 index 0000000..644a18f --- /dev/null +++ b/PyPDF2/pdf.py @@ -0,0 +1,2013 @@ +# -*- coding: utf-8 -*- +# +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +A pure-Python PDF library with very minimal capabilities. It was designed to +be able to split and merge PDF files by page, and that's about all it can do. +It may be a solid base for future PDF file work in Python. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import math +import struct +from sys import version_info +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +import filters +import utils +import warnings +from generic import * +from utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList + +if version_info < ( 2, 4 ): + from sets import ImmutableSet as frozenset + +if version_info < ( 2, 5 ): + from md5 import md5 +else: + from hashlib import md5 + +class InternalObjectException(Exception): + pass +## +# This class supports writing PDF files out, given pages produced by another +# class (typically {@link #PdfFileReader PdfFileReader}). 
+class PdfFileWriter(object): + def __init__(self): + self._header = "%PDF-1.3" + self._objects = [] # array of indirect objects + + # The root of our page tree node. + pages = DictionaryObject() + pages.update({ + NameObject("/Type"): NameObject("/Pages"), + NameObject("/Count"): NumberObject(0), + NameObject("/Kids"): ArrayObject(), + }) + self._pages = self._addObject(pages) + + # info object + info = DictionaryObject() + info.update({ + NameObject("/Producer"): createStringObject(u"Python PDF Library - http://pybrary.net/pyPdf/") + }) + self._info = self._addObject(info) + + # root object + root = DictionaryObject() + root.update({ + NameObject("/Type"): NameObject("/Catalog"), + NameObject("/Pages"): self._pages, + }) + self._root = self._addObject(root) + self._swept_cache = {} # cache of objects that have already been swept for references + + def _addObject(self, obj): + self._objects.append(obj) + return IndirectObject(len(self._objects), 0, self) + + def getObject(self, ido): + if ido.pdf != self: + raise ValueError("pdf must be self") + return self._objects[ido.idnum - 1] + + def getReference(self, obj): + idnum = self._objects.index(obj) + 1 + ref = IndirectObject(idnum, 0, self) + assert ref.getObject() == obj + return ref + + ## + # Common method for inserting or adding a page to this PDF file. + # + # @param page The page to add to the document. This argument should be + # an instance of {@link #PageObject PageObject}. + # @param action The function which will insert the page in the dictionnary. + # Takes: page list, page to add. + def _addPage(self, page, action): + assert page["/Type"] == "/Page" + page[NameObject("/Parent")] = self._pages + page = self._addObject(page) + pages = self.getObject(self._pages) + action(pages["/Kids"], page) + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) + + ## + # Adds a page to this PDF file. The page is usually acquired from a + # {@link #PdfFileReader PdfFileReader} instance. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # + # @param page The page to add to the document. This argument should be + # an instance of {@link #PageObject PageObject}. + def addPage(self, page): + self._addPage(page, list.append) + + ## + # Insert a page in this PDF file. The page is usually acquired from a + # {@link #PdfFileReader PdfFileReader} instance. + # + # @param page The page to add to the document. This argument should be + # an instance of {@link #PageObject PageObject}. + # @param index Position at which the page will be inserted. + def insertPage(self, page, index=0): + self._addPage(page, lambda l, p: l.insert(index, p)) + + ## + # Retrieves a page by number from this PDF file. + # @return Returns a {@link #PageObject PageObject} instance. + def getPage(self, pageNumber): + pages = self.getObject(self._pages) + # XXX: crude hack + return pages["/Kids"][pageNumber].getObject() + + ## + # Return the number of pages. + # @return The number of pages. + def getNumPages(self): + pages = self.getObject(self._pages) + return int(pages[NameObject("/Count")]) + + ## + # Append a blank page to this PDF file and returns it. If no page size + # is specified, use the size of the last page; throw + # PageSizeNotDefinedError if it doesn't exist. + # @param width The width of the new page expressed in default user + # space units. + # @param height The height of the new page expressed in default user + # space units. + def addBlankPage(self, width=None, height=None): + page = PageObject.createBlankPage(self, width, height) + self.addPage(page) + return page + + ## + # Insert a blank page to this PDF file and returns it. If no page size + # is specified, use the size of the page in the given index; throw + # PageSizeNotDefinedError if it doesn't exist. + # @param width The width of the new page expressed in default user + # space units. + # @param height The height of the new page expressed in default user + # space units. 
+ # @param index Position to add the page. + def insertBlankPage(self, width=None, height=None, index=0): + if width is None or height is None and \ + (self.getNumPages() - 1) >= index: + oldpage = self.getPage(index) + width = oldpage.mediaBox.getWidth() + height = oldpage.mediaBox.getHeight() + page = PageObject.createBlankPage(self, width, height) + self.insertPage(page, index) + return page + + ## + # Encrypt this PDF file with the PDF Standard encryption handler. + # @param user_pwd The "user password", which allows for opening and reading + # the PDF file with the restrictions provided. + # @param owner_pwd The "owner password", which allows for opening the PDF + # files without any restrictions. By default, the owner password is the + # same as the user password. + # @param use_128bit Boolean argument as to whether to use 128bit + # encryption. When false, 40bit encryption will be used. By default, this + # flag is on. + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): + import time, random + if owner_pwd == None: + owner_pwd = user_pwd + if use_128bit: + V = 2 + rev = 3 + keylen = 128 / 8 + else: + V = 1 + rev = 2 + keylen = 40 / 8 + # permit everything: + P = -1 + O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen)) + ID_1 = md5(repr(time.time())).digest() + ID_2 = md5(repr(random.random())).digest() + self._ID = ArrayObject((ByteStringObject(ID_1), ByteStringObject(ID_2))) + if rev == 2: + U, key = _alg34(user_pwd, O, P, ID_1) + else: + assert rev == 3 + U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False) + encrypt = DictionaryObject() + encrypt[NameObject("/Filter")] = NameObject("/Standard") + encrypt[NameObject("/V")] = NumberObject(V) + if V == 2: + encrypt[NameObject("/Length")] = NumberObject(keylen * 8) + encrypt[NameObject("/R")] = NumberObject(rev) + encrypt[NameObject("/O")] = ByteStringObject(O) + encrypt[NameObject("/U")] = ByteStringObject(U) + encrypt[NameObject("/P")] = NumberObject(P) + self._encrypt = 
self._addObject(encrypt) + self._encrypt_key = key + + ## + # Writes the collection of pages added to this object out as a PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @param stream An object to write the file to. The object must support + # the write method, and the tell method, similar to a file object. + def write(self, stream): + import struct + + externalReferenceMap = {} + self.stack = [] + self._swept_cache = {} + self._sweepIndirectReferences(externalReferenceMap, self._root) + self._swept_cache = {} + del self.stack + + # Begin writing: + object_positions = [] + stream.write(self._header + "\n") + for i in xrange(len(self._objects)): + idnum = (i + 1) + obj = self._objects[i] + object_positions.append(stream.tell()) + stream.write(str(idnum) + " 0 obj\n") + key = None + if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: + pack1 = struct.pack(" +# Stability: Added in v1.0, will exist for all v1.x releases. +# +# @param stream An object that supports the standard read and seek methods +# similar to a file object. +class PdfFileReader(object): + + def __init__(self, stream): + self.flattenedPages = None + self.resolvedObjects = {} + self.read(stream) + self.stream = stream + self._override_encryption = False + + ## + # Retrieves the PDF file's document information dictionary, if it exists. + # Note that some PDF files use metadata streams instead of docinfo + # dictionaries, and these metadata streams will not be accessed by this + # function. + #

+ # Stability: Added in v1.6, will exist for all future v1.x releases. + # @return Returns a {@link #DocumentInformation DocumentInformation} + # instance, or None if none exists. + def getDocumentInfo(self): + if not self.trailer.has_key("/Info"): + return None + obj = self.trailer['/Info'] + retval = DocumentInformation() + retval.update(obj) + return retval + + ## + # Read-only property that accesses the {@link + # #PdfFileReader.getDocumentInfo getDocumentInfo} function. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. + documentInfo = property(lambda self: self.getDocumentInfo(), None, None) + + ## + # Retrieves XMP (Extensible Metadata Platform) data from the PDF document + # root. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #generic.XmpInformation XmlInformation} + # instance that can be used to access XMP metadata from the document. + # Can also return None if no metadata was found on the document root. + def getXmpMetadata(self): + try: + self._override_encryption = True + return self.trailer["/Root"].getXmpMetadata() + finally: + self._override_encryption = False + + ## + # Read-only property that accesses the {@link #PdfFileReader.getXmpData + # getXmpData} function. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + + ## + # Calculates the number of pages in this PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @return Returns an integer. + def getNumPages(self): + if self.flattenedPages == None: + self._flatten() + return len(self.flattenedPages) + + ## + # Read-only property that accesses the {@link #PdfFileReader.getNumPages + # getNumPages} function. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. + numPages = property(lambda self: self.getNumPages(), None, None) + + ## + # Retrieves a page by number from this PDF file. + #

+ # Stability: Added in v1.0, will exist for all v1.x releases. + # @return Returns a {@link #PageObject PageObject} instance. + def getPage(self, pageNumber): + ## ensure that we're not trying to access an encrypted PDF + #assert not self.trailer.has_key("/Encrypt") + if self.flattenedPages == None: + self._flatten() + return self.flattenedPages[pageNumber] + + ## + # Read-only property that accesses the + # {@link #PdfFileReader.getNamedDestinations + # getNamedDestinations} function. + #

    # Stability: Added in v1.10, will exist for all future v1.x releases.
    namedDestinations = property(lambda self:
            self.getNamedDestinations(), None, None)

    ##
    # Retrieves the named destinations present in the document.
    #
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    # @return Returns a dict which maps names to {@link #Destination
    # destinations}.
    # @param tree Name tree node to process; None means start from the
    #             document catalog (non-None only on recursive calls).
    # @param retval Accumulator dict shared across recursive calls.
    def getNamedDestinations(self, tree=None, retval=None):
        # Top-level call: locate the destination name tree in the catalog,
        # either the old-style /Dests dictionary or the /Names name tree.
        if retval == None:
            retval = {}
            catalog = self.trailer["/Root"]

            # get the name tree
            if catalog.has_key("/Dests"):
                tree = catalog["/Dests"]
            elif catalog.has_key("/Names"):
                names = catalog['/Names']
                if isinstance(names, DictionaryObject) and names.has_key("/Dests"):
                    tree = names['/Dests']

        if tree == None or not isinstance(tree, DictionaryObject):
            return retval

        if tree.has_key("/Kids"):
            # recurse down the tree
            for kid in tree["/Kids"]:
                self.getNamedDestinations(kid.getObject(), retval)

        # /Names holds a flat array of alternating name / destination pairs.
        if tree.has_key("/Names"):
            names = tree["/Names"]
            for i in xrange(0, len(names), 2):
                key = names[i].getObject()
                val = names[i+1].getObject()
                # A destination may be wrapped in a dictionary under /D.
                if isinstance(val, DictionaryObject) and val.has_key('/D'):
                    val = val['/D']
                dest = self._buildDestination(key, val)
                if dest != None:
                    retval[key] = dest

        # Fallback for malformed documents that store destination arrays
        # directly as dictionary entries instead of using a name tree.
        if not tree.has_key("/Names") and not tree.has_key("/Kids"):
            for key in tree.keys():
                if isinstance(tree[key], ArrayObject) and isinstance(tree[key][0], PdfObject):
                    dest = self._buildDestination(key, tree[key])
                    if dest != None:
                        retval[key] = dest

        return retval

    ##
    # Read-only property that accesses the {@link #PdfFileReader.getOutlines
    # getOutlines} function.
    #

    # Stability: Added in v1.10, will exist for all future v1.x releases.
    outlines = property(lambda self: self.getOutlines(), None, None)

    ##
    # Retrieves the document outline present in the document.
    #
    # Stability: Added in v1.10, will exist for all future v1.x releases.
    # @return Returns a nested list of {@link #Destination destinations}.
    # @param node Outline node to start from; None means the catalog's
    #             /Outlines root (non-None only on recursive calls).
    # @param outlines Accumulator list shared across recursive calls.
    def getOutlines(self, node=None, outlines=None):
        # Top-level call: find the outline root and build the named
        # destination map used to resolve named /Dest entries.
        if outlines == None:
            outlines = []
            catalog = self.trailer["/Root"]

            # get the outline dictionary and named destinations
            if catalog.has_key("/Outlines"):
                lines = catalog["/Outlines"]
                if isinstance(lines, DictionaryObject) and lines.has_key("/First"):
                    node = lines["/First"]
            self._namedDests = self.getNamedDestinations()

        if node == None:
            return outlines

        # see if there are any more outlines
        while 1:
            outline = self._buildOutline(node)
            if outline:
                outlines.append(outline)

            # check for sub-outlines; children are appended as a nested list
            # immediately after their parent entry.
            if node.has_key("/First"):
                subOutlines = []
                self.getOutlines(node["/First"], subOutlines)
                if subOutlines:
                    outlines.append(subOutlines)

            if not node.has_key("/Next"):
                break
            node = node["/Next"]

        return outlines

    ##
    # Builds a Destination (or the given subclass) from a destination
    # array of the form [page, /FitType, args...].  Returns None and emits
    # a PdfReadWarning when the fit type is not recognized.
    def _buildDestination(self, title, array, classname=Destination):
        page, typ = array[0:2]
        array = array[2:]
        try:
            rv = classname(title, page, typ, *array)
        except utils.PdfReadError:
            rv = None
            warnings.warn("""Destination "%s" has unknown type: %r""" % (title, typ), utils.PdfReadWarning)
        return rv


    ##
    # Builds a single outline entry from an outline node dictionary, or
    # returns None if the node carries no usable destination.
    def _buildOutline(self, node):
        dest, title, outline = None, None, None

        if node.has_key("/A") and node.has_key("/Title"):
            # Action, section 8.5 (only type GoTo supported)
            title = node["/Title"]
            action = node["/A"]
            if action["/S"] == "/GoTo":
                dest = action["/D"]
        elif node.has_key("/Dest") and node.has_key("/Title"):
            # Destination, section 8.2.1
            title = node["/Title"]
            dest = node["/Dest"]

        # if destination found, then create outline
        if dest:
            if isinstance(dest, ArrayObject):
                outline = self._buildDestination(title, dest, Bookmark)
            elif isinstance(dest, (unicode, NameObject)) and self._namedDests.has_key(dest):
                # Named destination: reuse the prebuilt entry but give it
                # this node's title.
                outline = self._namedDests[dest]
                outline[NameObject("/Title")] = title
            else:
                #raise utils.PdfReadError()
                warnings.warn("Unexpected destination %r" % dest, utils.PdfReadWarning)
                return None
        return outline

    ##
    # Read-only property that emulates a list based upon the {@link
    # #PdfFileReader.getNumPages getNumPages} and {@link #PdfFileReader.getPage
    # getPage} functions.
    #

+ # Stability: Added in v1.7, and will exist for all future v1.x releases. + pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), + None, None) + + def _flatten(self, pages=None, inherit=None): + inheritablePageAttributes = ( + NameObject("/Resources"), NameObject("/MediaBox"), + NameObject("/CropBox"), NameObject("/Rotate") + ) + if inherit == None: + inherit = dict() + if pages == None: + self.flattenedPages = [] + catalog = self.trailer["/Root"].getObject() + pages = catalog["/Pages"].getObject() + t = pages["/Type"] + if t == "/Pages": + for attr in inheritablePageAttributes: + if pages.has_key(attr): + inherit[attr] = pages[attr] + for page in pages["/Kids"]: + self._flatten(page.getObject(), inherit) + elif t == "/Page": + for attr,value in inherit.items(): + # if the page has it's own value, it does not inherit the + # parent's value: + if not pages.has_key(attr): + pages[attr] = value + pageObj = PageObject(self) + pageObj.update(pages) + self.flattenedPages.append(pageObj) + + def getObject(self, indirectReference): + retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) + if retval != None: + return retval + if indirectReference.generation == 0 and \ + self.xref_objStm.has_key(indirectReference.idnum): + # indirect reference to object in object stream + # read the entire object stream into memory + stmnum,idx = self.xref_objStm[indirectReference.idnum] + objStm = IndirectObject(stmnum, 0, self).getObject() + assert objStm['/Type'] == '/ObjStm' + assert idx < objStm['/N'] + streamData = StringIO(objStm.getData()) + for i in xrange(objStm['/N']): + objnum = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + offset = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + t = streamData.tell() + streamData.seek(objStm['/First']+offset, 0) + obj = readObject(streamData, self) + 
self.resolvedObjects[0][objnum] = obj + streamData.seek(t, 0) + return self.resolvedObjects[0][indirectReference.idnum] + start = self.xref[indirectReference.generation][indirectReference.idnum] + self.stream.seek(start, 0) + idnum = indirectReference.idnum + generation = indirectReference.generation + try: + idnum, generation = self.readObjectHeader(self.stream) + assert idnum == indirectReference.idnum + assert generation == indirectReference.generation + except InternalObjectException: + retval = NullObject() + except AssertionError: + retval = NullObject() + else: + retval = readObject(self.stream, self) + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self.isEncrypted: + # if we don't have the encryption key: + if not hasattr(self, '_decryption_key'): + raise Exception, "file has not been decrypted" + # otherwise, decrypt here... + import struct + pack1 = struct.pack("= 1 + assert generation >= 0 + except ValueError: + raise InternalObjectException("Non-numeric object id, xref table is probably incorrect") + except AssertionError: + raise InternalObjectException("Invalid object id, xref table is possibly incorrect") + return idnum, generation + + def cacheIndirectObject(self, generation, idnum, obj): + if not self.resolvedObjects.has_key(generation): + self.resolvedObjects[generation] = {} + self.resolvedObjects[generation][idnum] = obj + + def read(self, stream): + # start at the end: + stream.seek(-1, 2) + line = '' + while not line: + line = self.readNextEndLine(stream) + if line[:5] != "%%EOF": + raise utils.PdfReadError, "EOF marker not found" + + # find startxref entry - the location of the xref table + line = self.readNextEndLine(stream) + startxref = int(line) + line = self.readNextEndLine(stream) + if line[:9] != "startxref": + raise utils.PdfReadError, "startxref not found" + + # read all cross reference tables and their trailers + self.xref = {} + self.xref_objStm = {} + self.trailer = 
DictionaryObject() + while 1: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x == "x": + # standard cross-reference table + ref = stream.read(4) + if ref[:3] != "ref": + raise utils.PdfReadError, "xref table read error" + readNonWhitespace(stream) + stream.seek(-1, 1) + while 1: + num = readObject(stream, self) + readNonWhitespace(stream) + stream.seek(-1, 1) + size = readObject(stream, self) + readNonWhitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes. However... some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. (0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in "0123456789t": + stream.seek(-1, 1) + offset, generation = line[:16].split(" ") + offset, generation = int(offset), int(generation) + if not self.xref.has_key(generation): + self.xref[generation] = {} + if self.xref[generation].has_key(num): + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + cnt += 1 + num += 1 + readNonWhitespace(stream) + stream.seek(-1, 1) + trailertag = stream.read(7) + if trailertag != "trailer": + # more xrefs! 
+ stream.seek(-7, 1) + else: + break + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in newTrailer.items(): + if not self.trailer.has_key(key): + self.trailer[key] = value + if newTrailer.has_key("/Prev"): + startxref = newTrailer["/Prev"] + else: + break + elif x.isdigit(): + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.readObjectHeader(stream) + xrefstream = readObject(stream, self) + assert xrefstream["/Type"] == "/XRef" + self.cacheIndirectObject(generation, idnum, xrefstream) + streamData = StringIO(xrefstream.getData()) + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + entrySizes = xrefstream.get("/W") + for num, size in self._pairs(idx_pairs): + cnt = 0 + while cnt < size: + for i in xrange(len(entrySizes)): + d = streamData.read(entrySizes[i]) + di = convertToInt(d, entrySizes[i]) + if i == 0: + xref_type = di + elif i == 1: + if xref_type == 0: + next_free_object = di + elif xref_type == 1: + byte_offset = di + elif xref_type == 2: + objstr_num = di + elif i == 2: + if xref_type == 0: + next_generation = di + elif xref_type == 1: + generation = di + elif xref_type == 2: + obstr_idx = di + if xref_type == 0: + pass + elif xref_type == 1: + if not self.xref.has_key(generation): + self.xref[generation] = {} + if not num in self.xref[generation]: + self.xref[generation][num] = byte_offset + elif xref_type == 2: + if not num in self.xref_objStm: + self.xref_objStm[num] = [objstr_num, obstr_idx] + cnt += 1 + num += 1 + trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" + for key in trailerKeys: + if xrefstream.has_key(key) and not self.trailer.has_key(key): + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if xrefstream.has_key("/Prev"): + startxref = xrefstream["/Prev"] + else: + break + else: + # bad xref character at startxref. 
Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find("xref") + if xref_loc != -1: + startxref -= (10 - xref_loc) + continue + else: + # no xref table found at specified location + assert False + break + + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + + def readNextEndLine(self, stream): + line = "" + while True: + x = stream.read(1) + stream.seek(-2, 1) + if x == '\n' or x == '\r': + while x == '\n' or x == '\r': + x = stream.read(1) + stream.seek(-2, 1) + stream.seek(1, 1) + break + else: + line = x + line + return line + + ## + # When using an encrypted / secured PDF file with the PDF Standard + # encryption handler, this function will allow the file to be decrypted. + # It checks the given password against the document's user password and + # owner password, and then stores the resulting decryption key if either + # password is correct. + #

+ # It does not matter which password was matched. Both passwords provide + # the correct decryption key that will allow the document to be used with + # this library. + #

+ # Stability: Added in v1.8, will exist for all future v1.x releases. + # + # @return 0 if the password failed, 1 if the password matched the user + # password, and 2 if the password matched the owner password. + # + # @exception NotImplementedError Document uses an unsupported encryption + # method. + def decrypt(self, password): + self._override_encryption = True + try: + return self._decrypt(password) + finally: + self._override_encryption = False + + def _decrypt(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + if encrypt['/Filter'] != '/Standard': + raise NotImplementedError, "only Standard PDF encryption handler is available" + if not (encrypt['/V'] in (1, 2)): + raise NotImplementedError, "only algorithm code 1 and 2 are supported" + user_password, key = self._authenticateUserPassword(password) + if user_password: + self._decryption_key = key + return 1 + else: + rev = encrypt['/R'].getObject() + if rev == 2: + keylen = 5 + else: + keylen = encrypt['/Length'].getObject() / 8 + key = _alg33_1(password, rev, keylen) + real_O = encrypt["/O"].getObject() + if rev == 2: + userpass = utils.RC4_encrypt(key, real_O) + else: + val = real_O + for i in xrange(19, -1, -1): + new_key = '' + for l in xrange(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + userpass = val + owner_password, key = self._authenticateUserPassword(userpass) + if owner_password: + self._decryption_key = key + return 2 + return 0 + + def _authenticateUserPassword(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + rev = encrypt['/R'].getObject() + owner_entry = encrypt['/O'].getObject().original_bytes + p_entry = encrypt['/P'].getObject() + id_entry = self.trailer['/ID'].getObject() + id1_entry = id_entry[0].getObject() + if rev == 2: + U, key = _alg34(password, owner_entry, p_entry, id1_entry) + elif rev >= 3: + U, key = _alg35(password, rev, + encrypt["/Length"].getObject() / 8, owner_entry, + p_entry, id1_entry, + 
encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) + real_U = encrypt['/U'].getObject().original_bytes + return U == real_U, key + + def getIsEncrypted(self): + return self.trailer.has_key("/Encrypt") + + ## + # Read-only boolean property showing whether this PDF file is encrypted. + # Note that this property, if true, will remain true even after the {@link + # #PdfFileReader.decrypt decrypt} function is called. + isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) + + +def getRectangle(self, name, defaults): + retval = self.get(name) + if isinstance(retval, RectangleObject): + return retval + if retval == None: + for d in defaults: + retval = self.get(d) + if retval != None: + break + if isinstance(retval, IndirectObject): + retval = self.pdf.getObject(retval) + retval = RectangleObject(retval) + setRectangle(self, name, retval) + return retval + +def setRectangle(self, name, value): + if not isinstance(name, NameObject): + name = NameObject(name) + self[name] = value + +def deleteRectangle(self, name): + del self[name] + +def createRectangleAccessor(name, fallback): + return \ + property( + lambda self: getRectangle(self, name, fallback), + lambda self, value: setRectangle(self, name, value), + lambda self: deleteRectangle(self, name) + ) + +## +# This class represents a single page within a PDF file. Typically this object +# will be created by accessing the {@link #PdfFileReader.getPage getPage} +# function of the {@link #PdfFileReader PdfFileReader} class, but it is +# also possible to create an empty page with the createBlankPage static +# method. +# @param pdf PDF file the page belongs to (optional, defaults to None). +class PageObject(DictionaryObject): + def __init__(self, pdf=None): + DictionaryObject.__init__(self) + self.pdf = pdf + + ## + # Returns a new blank page. + # If width or height is None, try to get the page size from the + # last page of pdf. 
If pdf is None or contains no page, a + # PageSizeNotDefinedError is raised. + # @param pdf PDF file the page belongs to + # @param width The width of the new page expressed in default user + # space units. + # @param height The height of the new page expressed in default user + # space units. + def createBlankPage(pdf=None, width=None, height=None): + page = PageObject(pdf) + + # Creates a new page (cf PDF Reference 7.7.3.3) + page.__setitem__(NameObject('/Type'), NameObject('/Page')) + page.__setitem__(NameObject('/Parent'), NullObject()) + page.__setitem__(NameObject('/Resources'), DictionaryObject()) + if width is None or height is None: + if pdf is not None and pdf.getNumPages() > 0: + lastpage = pdf.getPage(pdf.getNumPages() - 1) + width = lastpage.mediaBox.getWidth() + height = lastpage.mediaBox.getHeight() + else: + raise utils.PageSizeNotDefinedError() + page.__setitem__(NameObject('/MediaBox'), + RectangleObject([0, 0, width, height])) + + return page + createBlankPage = staticmethod(createBlankPage) + + ## + # Rotates a page clockwise by increments of 90 degrees. + #

+ # Stability: Added in v1.1, will exist for all future v1.x releases. + # @param angle Angle to rotate the page. Must be an increment of 90 deg. + def rotateClockwise(self, angle): + assert angle % 90 == 0 + self._rotate(angle) + return self + + ## + # Rotates a page counter-clockwise by increments of 90 degrees. + #

+ # Stability: Added in v1.1, will exist for all future v1.x releases. + # @param angle Angle to rotate the page. Must be an increment of 90 deg. + def rotateCounterClockwise(self, angle): + assert angle % 90 == 0 + self._rotate(-angle) + return self + + def _rotate(self, angle): + currentAngle = self.get("/Rotate", 0) + self[NameObject("/Rotate")] = NumberObject(currentAngle + angle) + + def _mergeResources(res1, res2, resource): + newRes = DictionaryObject() + newRes.update(res1.get(resource, DictionaryObject()).getObject()) + page2Res = res2.get(resource, DictionaryObject()).getObject() + renameRes = {} + for key in page2Res.keys(): + if newRes.has_key(key) and newRes[key] != page2Res[key]: + newname = NameObject(key + "renamed") + renameRes[key] = newname + newRes[newname] = page2Res[key] + elif not newRes.has_key(key): + newRes[key] = page2Res.raw_get(key) + return newRes, renameRes + _mergeResources = staticmethod(_mergeResources) + + def _contentStreamRename(stream, rename, pdf): + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands,operator in stream.operations: + for i in xrange(len(operands)): + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + return stream + _contentStreamRename = staticmethod(_contentStreamRename) + + def _pushPopGS(contents, pdf): + # adds a graphics state "push" and "pop" to the beginning and end + # of a content stream. This isolates it from changes such as + # transformation matricies. + stream = ContentStream(contents, pdf) + stream.operations.insert(0, [[], "q"]) + stream.operations.append([[], "Q"]) + return stream + _pushPopGS = staticmethod(_pushPopGS) + + def _addTransformationMatrix(contents, pdf, ctm): + # adds transformation matrix at the beginning of the given + # contents stream. 
+ a, b, c, d, e, f = ctm + contents = ContentStream(contents, pdf) + contents.operations.insert(0, [[FloatObject(a), FloatObject(b), + FloatObject(c), FloatObject(d), FloatObject(e), + FloatObject(f)], " cm"]) + return contents + _addTransformationMatrix = staticmethod(_addTransformationMatrix) + + ## + # Returns the /Contents object, or None if it doesn't exist. + # /Contents is optionnal, as described in PDF Reference 7.7.3.3 + def getContents(self): + if self.has_key("/Contents"): + return self["/Contents"].getObject() + else: + return None + + ## + # Merges the content streams of two pages into one. Resource references + # (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + # of this page are not altered. The parameter page's content stream will + # be added to the end of this page's content stream, meaning that it will + # be drawn after, or "on top" of this page. + #

    # Stability: Added in v1.4, will exist for all future 1.x releases.
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    # into this one.
    def mergePage(self, page2):
        self._mergePage(page2)

    ##
    # Actually merges the content streams of two pages into one.  Resource
    # references (i.e. fonts) are maintained from both pages.  The
    # mediabox/cropbox/etc of this page are not altered.  The parameter page's
    # content stream will be added to the end of this page's content stream,
    # meaning that it will be drawn after, or "on top" of this page.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged
    # into this one.
    # @param page2transformation A fuction which applies a transformation to
    #                            the content stream of page2. Takes: page2
    #                            contents stream. Must return: new contents
    #                            stream. If omitted, the content stream will
    #                            not be modified.
    def _mergePage(self, page2, page2transformation=None):
        # First we work on merging the resource dictionaries.  This allows us
        # to find out what symbols in the content streams we might need to
        # rename.

        newResources = DictionaryObject()
        rename = {}
        originalResources = self["/Resources"].getObject()
        page2Resources = page2["/Resources"].getObject()

        # Merge each standard resource category; collisions in page2 are
        # renamed and recorded so its content stream can be rewritten below.
        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
            new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
                frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
            )
        )

        # Build the combined content: this page's stream first, then
        # page2's (optionally transformed and renamed), each isolated in
        # its own graphics-state push/pop pair.
        newContentArray = ArrayObject()

        originalContent = self.getContents()
        if originalContent is not None:
            newContentArray.append(PageObject._pushPopGS(
                originalContent, self.pdf))

        page2Content = page2.getContents()
        if page2Content is not None:
            if page2transformation is not None:
                page2Content = page2transformation(page2Content)
            page2Content = PageObject._contentStreamRename(
                page2Content, rename, self.pdf)
            page2Content = PageObject._pushPopGS(page2Content, self.pdf)
            newContentArray.append(page2Content)

        self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
        self[NameObject('/Resources')] = newResources

    ##
    # This is similar to mergePage, but a transformation matrix is
    # applied to the merged stream.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param ctm A 6 elements tuple containing the operands of the
    # transformation matrix
    def mergeTransformedPage(self, page2, ctm):
        self._mergePage(page2, lambda page2Content:
            PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm))

    ##
    # This is similar to mergePage, but the stream to be merged is scaled
    # by appling a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
    # @param factor The scaling factor
    def mergeScaledPage(self, page2, factor):
        # CTM to scale : [ sx 0 0 sy 0 0 ]
        return self.mergeTransformedPage(page2, [factor, 0,
            0, factor,
            0, 0])

    ##
    # This is similar to mergePage, but the stream to be merged is rotated
    # by appling a transformation matrix.
    #
    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
+    # @param rotation The angle of the rotation, in degrees
+    def mergeRotatedPage(self, page2, rotation):
+        rotation = math.radians(rotation)
+        return self.mergeTransformedPage(page2,
+            [math.cos(rotation),  math.sin(rotation),
+             -math.sin(rotation), math.cos(rotation),
+             0,                   0])
+
+    ##
+    # This is similar to mergePage, but the stream to be merged is translated
+    # by applying a transformation matrix.
+    #
+    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
+    # @param tx The translation on X axis
+    # @param ty The translation on Y axis
+    def mergeTranslatedPage(self, page2, tx, ty):
+        return self.mergeTransformedPage(page2, [1,  0,
+                                                 0,  1,
+                                                 tx, ty])
+
+    ##
+    # This is similar to mergePage, but the stream to be merged is rotated
+    # and scaled by applying a transformation matrix.
+    #
+    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
+    # @param rotation The angle of the rotation, in degrees
+    # @param scale The scaling factor
+    def mergeRotatedScaledPage(self, page2, rotation, scale):
+        rotation = math.radians(rotation)
+        rotating = [[math.cos(rotation), math.sin(rotation),0],
+                    [-math.sin(rotation),math.cos(rotation), 0],
+                    [0,                  0,                  1]]
+        scaling = [[scale,0,    0],
+                   [0,    scale,0],
+                   [0,    0,    1]]
+        ctm = utils.matrixMultiply(rotating, scaling)
+
+        return self.mergeTransformedPage(page2,
+            [ctm[0][0], ctm[0][1],
+             ctm[1][0], ctm[1][1],
+             ctm[2][0], ctm[2][1]])
+
+    ##
+    # This is similar to mergePage, but the stream to be merged is translated
+    # and scaled by applying a transformation matrix.
+    #
+    # @param page2 An instance of {@link #PageObject PageObject} to be merged.
+ # @param scale The scaling factor + # @param tx The translation on X axis + # @param tx The translation on Y axis + def mergeScaledTranslatedPage(self, page2, scale, tx, ty): + translation = [[1, 0, 0], + [0, 1, 0], + [tx,ty,1]] + scaling = [[scale,0, 0], + [0, scale,0], + [0, 0, 1]] + ctm = utils.matrixMultiply(scaling, translation) + + return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]]) + + ## + # This is similar to mergePage, but the stream to be merged is translated, + # rotated and scaled by appling a transformation matrix. + # + # @param page2 An instance of {@link #PageObject PageObject} to be merged. + # @param tx The translation on X axis + # @param ty The translation on Y axis + # @param rotation The angle of the rotation, in degrees + # @param scale The scaling factor + def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty): + translation = [[1, 0, 0], + [0, 1, 0], + [tx,ty,1]] + rotation = math.radians(rotation) + rotating = [[math.cos(rotation), math.sin(rotation),0], + [-math.sin(rotation),math.cos(rotation), 0], + [0, 0, 1]] + scaling = [[scale,0, 0], + [0, scale,0], + [0, 0, 1]] + ctm = utils.matrixMultiply(rotating, scaling) + ctm = utils.matrixMultiply(ctm, translation) + + return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1], + ctm[1][0], ctm[1][1], + ctm[2][0], ctm[2][1]]) + + ## + # Applys a transformation matrix the page. + # + # @param ctm A 6 elements tuple containing the operands of the + # transformation matrix + def addTransformation(self, ctm): + originalContent = self.getContents() + if originalContent is not None: + newContent = PageObject._addTransformationMatrix( + originalContent, self.pdf, ctm) + newContent = PageObject._pushPopGS(newContent, self.pdf) + self[NameObject('/Contents')] = newContent + + ## + # Scales a page by the given factors by appling a transformation + # matrix to its content and updating the page size. 
+    #
+    # @param sx The scaling factor on horizontal axis
+    # @param sy The scaling factor on vertical axis
+    def scale(self, sx, sy):
+        self.addTransformation([sx, 0,
+                                0,  sy,
+                                0,  0])
+        self.mediaBox = RectangleObject([
+            float(self.mediaBox.getLowerLeft_x()) * sx,
+            float(self.mediaBox.getLowerLeft_y()) * sy,
+            float(self.mediaBox.getUpperRight_x()) * sx,
+            float(self.mediaBox.getUpperRight_y()) * sy])
+
+    ##
+    # Scales a page by the given factor by applying a transformation
+    # matrix to its content and updating the page size.
+    #
+    # @param factor The scaling factor
+    def scaleBy(self, factor):
+        self.scale(factor, factor)
+
+    ##
+    # Scales a page to the specified dimensions by applying a
+    # transformation matrix to its content and updating the page size.
+    #
+    # @param width The new width
+    # @param height The new height
+    def scaleTo(self, width, height):
+        sx = width / (self.mediaBox.getUpperRight_x() -
+                      self.mediaBox.getLowerLeft_x ())
+        sy = height / (self.mediaBox.getUpperRight_y() -
+                       self.mediaBox.getLowerLeft_x ())
+        self.scale(sx, sy)
+
+    ##
+    # Compresses the size of this page by joining all content streams and
+    # applying a FlateDecode filter.
+    #

+ # Stability: Added in v1.6, will exist for all future v1.x releases. + # However, it is possible that this function will perform no action if + # content stream compression becomes "automatic" for some reason. + def compressContentStreams(self): + content = self.getContents() + if content is not None: + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + self[NameObject("/Contents")] = content.flateEncode() + + ## + # Locate all text drawing commands, in the order they are provided in the + # content stream, and extract the text. This works well for some PDF + # files, but poorly for others, depending on the generator used. This will + # be refined in the future. Do not rely on the order of text coming out of + # this function, as it will change if this function is made more + # sophisticated. + #

+ # Stability: Added in v1.7, will exist for all future v1.x releases. May + # be overhauled to provide more ordered text in the future. + # @return a unicode string object + def extractText(self): + text = u"" + content = self["/Contents"].getObject() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + # Note: we check all strings are TextStringObjects. ByteStringObjects + # are strings where the byte->string encoding was unknown, so adding + # them to the text here would be gibberish. + for operands,operator in content.operations: + if operator == "Tj": + _text = operands[0] + if isinstance(_text, TextStringObject): + text += _text + elif operator == "T*": + text += "\n" + elif operator == "'": + text += "\n" + _text = operands[0] + if isinstance(_text, TextStringObject): + text += operands[0] + elif operator == '"': + _text = operands[2] + if isinstance(_text, TextStringObject): + text += "\n" + text += _text + elif operator == "TJ": + for i in operands[0]: + if isinstance(i, TextStringObject): + text += i + return text + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the boundaries of the physical medium on which the page is + # intended to be displayed or printed. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + mediaBox = createRectangleAccessor("/MediaBox", ()) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the visible region of default user space. When the page is + # displayed or printed, its contents are to be clipped (cropped) to this + # rectangle and then imposed on the output medium in some + # implementation-defined manner. Default value: same as MediaBox. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",)) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the region to which the contents of the page should be clipped + # when output in a production enviroment. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox")) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the intended dimensions of the finished page after trimming. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox")) + + ## + # A rectangle (RectangleObject), expressed in default user space units, + # defining the extent of the page's meaningful content as intended by the + # page's creator. + #

+ # Stability: Added in v1.4, will exist for all future v1.x releases. + artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox")) + + +class ContentStream(DecodedStreamObject): + def __init__(self, stream, pdf): + self.pdf = pdf + self.operations = [] + # stream may be a StreamObject or an ArrayObject containing + # multiple StreamObjects to be cat'd together. + stream = stream.getObject() + if isinstance(stream, ArrayObject): + data = "" + for s in stream: + data += s.getObject().getData() + stream = StringIO(data) + else: + stream = StringIO(stream.getData()) + self.__parseContentStream(stream) + + def __parseContentStream(self, stream): + # file("f:\\tmp.txt", "w").write(stream.read()) + stream.seek(0, 0) + operands = [] + while True: + peek = readNonWhitespace(stream) + if peek == '': + break + stream.seek(-1, 1) + if peek.isalpha() or peek == "'" or peek == '"': + operator = "" + while True: + tok = stream.read(1) + if tok.isspace() or tok in NameObject.delimiterCharacters: + stream.seek(-1, 1) + break + elif tok == '': + break + operator += tok + if operator == "BI": + # begin inline image - a completely different parsing + # mechanism is required, of course... thanks buddy... + assert operands == [] + ii = self._readInlineImage(stream) + self.operations.append((ii, "INLINE IMAGE")) + else: + self.operations.append((operands, operator)) + operands = [] + elif peek == '%': + # If we encounter a comment in the content stream, we have to + # handle it here. Typically, readObject will handle + # encountering a comment -- but readObject assumes that + # following the comment must be the object we're trying to + # read. In this case, it could be an operator instead. + while peek not in ('\r', '\n'): + peek = stream.read(1) + else: + operands.append(readObject(stream, None)) + + def _readInlineImage(self, stream): + # begin reading just after the "BI" - begin image + # first read the dictionary of settings. 
+ settings = DictionaryObject() + while True: + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + if tok == "I": + # "ID" - begin of image data + break + key = readObject(stream, self.pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, self.pdf) + settings[key] = value + # left at beginning of ID + tmp = stream.read(3) + assert tmp[:2] == "ID" + data = "" + while True: + tok = stream.read(1) + if tok == "E": + next = stream.read(1) + if next == "I": + break + else: + stream.seek(-1, 1) + data += tok + else: + data += tok + x = readNonWhitespace(stream) + stream.seek(-1, 1) + return {"settings": settings, "data": data} + + def _getData(self): + newdata = StringIO() + for operands,operator in self.operations: + if operator == "INLINE IMAGE": + newdata.write("BI") + dicttext = StringIO() + operands["settings"].writeToStream(dicttext, None) + newdata.write(dicttext.getvalue()[2:-2]) + newdata.write("ID ") + newdata.write(operands["data"]) + newdata.write("EI") + else: + for op in operands: + op.writeToStream(newdata, None) + newdata.write(" ") + newdata.write(operator) + newdata.write("\n") + return newdata.getvalue() + + def _setData(self, value): + self.__parseContentStream(StringIO(value)) + + _data = property(_getData, _setData) + + +## +# A class representing the basic document metadata provided in a PDF File. +#

+# As of pyPdf v1.10, all text properties of the document metadata have two +# properties, eg. author and author_raw. The non-raw property will always +# return a TextStringObject, making it ideal for a case where the metadata is +# being displayed. The raw property can sometimes return a ByteStringObject, +# if pyPdf was unable to decode the string's text encoding; this requires +# additional safety in the caller and therefore is not as commonly accessed. +class DocumentInformation(DictionaryObject): + def __init__(self): + DictionaryObject.__init__(self) + + def getText(self, key): + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + ## + # Read-only property accessing the document's title. Added in v1.6, will + # exist for all future v1.x releases. Modified in v1.10 to always return a + # unicode string (TextStringObject). + # @return A unicode string, or None if the title is not provided. + title = property(lambda self: self.getText("/Title")) + title_raw = property(lambda self: self.get("/Title")) + + ## + # Read-only property accessing the document's author. Added in v1.6, will + # exist for all future v1.x releases. Modified in v1.10 to always return a + # unicode string (TextStringObject). + # @return A unicode string, or None if the author is not provided. + author = property(lambda self: self.getText("/Author")) + author_raw = property(lambda self: self.get("/Author")) + + ## + # Read-only property accessing the subject of the document. Added in v1.6, + # will exist for all future v1.x releases. Modified in v1.10 to always + # return a unicode string (TextStringObject). + # @return A unicode string, or None if the subject is not provided. + subject = property(lambda self: self.getText("/Subject")) + subject_raw = property(lambda self: self.get("/Subject")) + + ## + # Read-only property accessing the document's creator. 
If the document was + # converted to PDF from another format, the name of the application (for + # example, OpenOffice) that created the original document from which it was + # converted. Added in v1.6, will exist for all future v1.x releases. + # Modified in v1.10 to always return a unicode string (TextStringObject). + # @return A unicode string, or None if the creator is not provided. + creator = property(lambda self: self.getText("/Creator")) + creator_raw = property(lambda self: self.get("/Creator")) + + ## + # Read-only property accessing the document's producer. If the document + # was converted to PDF from another format, the name of the application + # (for example, OSX Quartz) that converted it to PDF. Added in v1.6, will + # exist for all future v1.x releases. Modified in v1.10 to always return a + # unicode string (TextStringObject). + # @return A unicode string, or None if the producer is not provided. + producer = property(lambda self: self.getText("/Producer")) + producer_raw = property(lambda self: self.get("/Producer")) + + + +def convertToInt(d, size): + if size > 8: + raise utils.PdfReadError("invalid size in convertToInt") + d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] + +# ref: pdf1.8 spec section 3.5.2 algorithm 3.2 +_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \ + '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c' + \ + '\xa9\xfe\x64\x53\x69\x7a' + +# Implementation of algorithm 3.2 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True): + # 1. Pad or truncate the password string to exactly 32 bytes. 
If the
+    # password string is more than 32 bytes long, use only its first 32 bytes;
+    # if it is less than 32 bytes long, pad it by appending the required number
+    # of additional bytes from the beginning of the padding string
+    # (_encryption_padding).
+    password = (password + _encryption_padding)[:32]
+    # 2. Initialize the MD5 hash function and pass the result of step 1 as
+    # input to this function.
+    import struct
+    m = md5(password)
+    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
+    # function.
+    m.update(owner_entry)
+    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
+    # these bytes to the MD5 hash function, low-order byte first.
+    p_entry = struct.pack('<i', p_entry)
+    m.update(p_entry)
+    # 5. Pass the first element of the file's file identifier array (the value
+    # of the ID entry in the document's trailer dictionary) to the MD5 hash
+    # function.
+    m.update(id1_entry)
+    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
+    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
+    if rev >= 3 and not metadata_encrypt:
+        m.update("\xff\xff\xff\xff")
+    # 7. Finish the hash.
+    md5_hash = m.digest()
+    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
+    # from the previous MD5 hash and pass the first n bytes of the output as
+    # input into a new MD5 hash, where n is the number of bytes of the
+    # encryption key as defined by the value of the encryption dictionary's
+    # /Length entry.
+    if rev >= 3:
+        for i in xrange(50):
+            md5_hash = md5(md5_hash[:keylen]).digest()
+    # 9. Set the encryption key to the first n bytes of the output from the
+    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
+    # greater, depends on the value of the encryption dictionary's /Length
+    # entry.
+    return md5_hash[:keylen]
+
+# Implementation of algorithm 3.3 of the PDF standard security handler,
+# section 3.5.2 of the PDF 1.6 reference.
+def _alg33(owner_pwd, user_pwd, rev, keylen):
+    # steps 1 - 4
+    key = _alg33_1(owner_pwd, rev, keylen)
+    # 5. Pad or truncate the user password string as described in step 1 of
+    # algorithm 3.2.
+    user_pwd = (user_pwd + _encryption_padding)[:32]
+    # 6. Encrypt the result of step 5, using an RC4 encryption function with
+    # the encryption key obtained in step 4.
+ val = utils.RC4_encrypt(key, user_pwd) + # 7. (Revision 3 or greater) Do the following 19 times: Take the output + # from the previous invocation of the RC4 function and pass it as input to + # a new invocation of the function; use an encryption key generated by + # taking each byte of the encryption key obtained in step 4 and performing + # an XOR operation between that byte and the single-byte value of the + # iteration counter (from 1 to 19). + if rev >= 3: + for i in xrange(1, 20): + new_key = '' + for l in xrange(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + # 8. Store the output from the final invocation of the RC4 as the value of + # the /O entry in the encryption dictionary. + return val + +# Steps 1-4 of algorithm 3.3 +def _alg33_1(password, rev, keylen): + # 1. Pad or truncate the owner password string as described in step 1 of + # algorithm 3.2. If there is no owner password, use the user password + # instead. + password = (password + _encryption_padding)[:32] + # 2. Initialize the MD5 hash function and pass the result of step 1 as + # input to this function. + m = md5(password) + # 3. (Revision 3 or greater) Do the following 50 times: Take the output + # from the previous MD5 hash and pass it as input into a new MD5 hash. + md5_hash = m.digest() + if rev >= 3: + for i in xrange(50): + md5_hash = md5(md5_hash).digest() + # 4. Create an RC4 encryption key using the first n bytes of the output + # from the final MD5 hash, where n is always 5 for revision 2 but, for + # revision 3 or greater, depends on the value of the encryption + # dictionary's /Length entry. + key = md5_hash[:keylen] + return key + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg34(password, owner_entry, p_entry, id1_entry): + # 1. Create an encryption key based on the user password string, as + # described in algorithm 3.2. 
+ key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) + # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, + # using an RC4 encryption function with the encryption key from the + # preceding step. + U = utils.RC4_encrypt(key, _encryption_padding) + # 3. Store the result of step 2 as the value of the /U entry in the + # encryption dictionary. + return U, key + +# Implementation of algorithm 3.4 of the PDF standard security handler, +# section 3.5.2 of the PDF 1.6 reference. +def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): + # 1. Create an encryption key based on the user password string, as + # described in Algorithm 3.2. + key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + # 2. Initialize the MD5 hash function and pass the 32-byte padding string + # shown in step 1 of Algorithm 3.2 as input to this function. + m = md5() + m.update(_encryption_padding) + # 3. Pass the first element of the file's file identifier array (the value + # of the ID entry in the document's trailer dictionary; see Table 3.13 on + # page 73) to the hash function and finish the hash. (See implementation + # note 25 in Appendix H.) + m.update(id1_entry) + md5_hash = m.digest() + # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption + # function with the encryption key from step 1. + val = utils.RC4_encrypt(key, md5_hash) + # 5. Do the following 19 times: Take the output from the previous + # invocation of the RC4 function and pass it as input to a new invocation + # of the function; use an encryption key generated by taking each byte of + # the original encryption key (obtained in step 2) and performing an XOR + # operation between that byte and the single-byte value of the iteration + # counter (from 1 to 19). + for i in xrange(1, 20): + new_key = '' + for l in xrange(len(key)): + new_key += chr(ord(key[l]) ^ i) + val = utils.RC4_encrypt(new_key, val) + # 6. 
Append 16 bytes of arbitrary padding to the output from the final + # invocation of the RC4 function and store the 32-byte result as the value + # of the U entry in the encryption dictionary. + # (implementator note: I don't know what "arbitrary padding" is supposed to + # mean, so I have used null bytes. This seems to match a few other + # people's implementations) + return val + ('\x00' * 16), key + +#if __name__ == "__main__": +# output = PdfFileWriter() +# +# input1 = PdfFileReader(file("test\\5000-s1-05e.pdf", "rb")) +# page1 = input1.getPage(0) +# +# input2 = PdfFileReader(file("test\\PDFReference16.pdf", "rb")) +# page2 = input2.getPage(0) +# page3 = input2.getPage(1) +# page1.mergePage(page2) +# page1.mergePage(page3) +# +# input3 = PdfFileReader(file("test\\cc-cc.pdf", "rb")) +# page1.mergePage(input3.getPage(0)) +# +# page1.compressContentStreams() +# +# output.addPage(page1) +# output.write(file("test\\merge-test.pdf", "wb")) + + diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py new file mode 100644 index 0000000..3fcd5b0 --- /dev/null +++ b/PyPDF2/utils.py @@ -0,0 +1,125 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +""" +Utility functions for PDF library. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +#ENABLE_PSYCO = False +#if ENABLE_PSYCO: +# try: +# import psyco +# except ImportError: +# ENABLE_PSYCO = False +# +#if not ENABLE_PSYCO: +# class psyco: +# def proxy(func): +# return func +# proxy = staticmethod(proxy) + +def readUntilWhitespace(stream, maxchars=None): + txt = "" + while True: + tok = stream.read(1) + if tok.isspace() or not tok: + break + txt += tok + if len(txt) == maxchars: + break + return txt + +def readNonWhitespace(stream): + tok = ' ' + while tok == '\n' or tok == '\r' or tok == ' ' or tok == '\t': + tok = stream.read(1) + return tok + +class ConvertFunctionsToVirtualList(object): + def __init__(self, lengthFunction, getFunction): + self.lengthFunction = lengthFunction + self.getFunction = getFunction + + def __len__(self): + return self.lengthFunction() + + def __getitem__(self, index): + if not isinstance(index, int): + raise TypeError, "sequence indices must be integers" + len_self = len(self) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError, "sequence 
index out of range" + return self.getFunction(index) + +def RC4_encrypt(key, plaintext): + S = [i for i in range(256)] + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % len(key)])) % 256 + S[i], S[j] = S[j], S[i] + i, j = 0, 0 + retval = "" + for x in range(len(plaintext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + t = S[(S[i] + S[j]) % 256] + retval += chr(ord(plaintext[x]) ^ t) + return retval + +def matrixMultiply(a, b): + return [[sum([float(i)*float(j) + for i, j in zip(row, col)] + ) for col in zip(*b)] + for row in a] + +class PyPdfError(Exception): + pass + +class PdfReadError(PyPdfError): + pass + +class PageSizeNotDefinedError(PyPdfError): + pass + +class PdfReadWarning(UserWarning): + pass + +if __name__ == "__main__": + # test RC4 + out = RC4_encrypt("Key", "Plaintext") + print repr(out) + pt = RC4_encrypt("Key", out) + print repr(pt) diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py new file mode 100644 index 0000000..3aadc85 --- /dev/null +++ b/PyPDF2/xmp.py @@ -0,0 +1,355 @@ +import re +import datetime +import decimal +from generic import PdfObject +from xml.dom import getDOMImplementation +from xml.dom.minidom import parseString + +RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" +XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" +PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" +XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" + +# What is the PDFX namespace, you might ask? I might ask that too. It's +# a completely undocumented namespace used to place "custom metadata" +# properties, which are arbitrary metadata properties with no semantic or +# documented meaning. Elements in the namespace are key/value-style storage, +# where the element name is the key and the content is the value. 
The keys +# are transformed into valid XML identifiers by substituting an invalid +# identifier character with \u2182 followed by the unicode hex ID of the +# original character. A key like "my car" is therefore "my\u21820020car". +# +# \u2182, in case you're wondering, is the unicode character +# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for +# escaping characters. +# +# Intentional users of the pdfx namespace should be shot on sight. A +# custom data schema and sensical XML elements could be used instead, as is +# suggested by Adobe's own documentation on XMP (under "Extensibility of +# Schemas"). +# +# Information presented here on the /pdfx/ schema is a result of limited +# reverse engineering, and does not constitute a full specification. +PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/" + +iso8601 = re.compile(""" + (?P[0-9]{4}) + (- + (?P[0-9]{2}) + (- + (?P[0-9]+) + (T + (?P[0-9]{2}): + (?P[0-9]{2}) + (:(?P[0-9]{2}(.[0-9]+)?))? + (?PZ|[-+][0-9]{2}:[0-9]{2}) + )? + )? + )? + """, re.VERBOSE) + +## +# An object that represents Adobe XMP metadata. 
class XmpInformation(PdfObject):
    """Container for Adobe XMP metadata read from a PDF metadata stream.

    Wraps the raw metadata stream object, parses its XML payload, and exposes
    the standard Dublin Core / PDF / XMP / XMP-MM properties as read-only
    attributes.  Parsed values are memoized per (namespace, name) in
    ``self.cache``.
    """

    def __init__(self, stream):
        # stream: the PDF stream object holding the raw XMP XML packet.
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # cache maps namespace URI -> {property name -> parsed value}.
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        # Serialization is delegated to the underlying metadata stream.
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """Yield each attribute or child element matching (namespace, name)
        on every rdf:Description whose rdf:about equals aboutUri."""
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                # The property may be stored as an XML attribute...
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                # ...and/or as one or more child elements.
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """Yield every attribute and child node belonging to *namespace* on
        every rdf:Description whose rdf:about equals aboutUri."""
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        # Concatenate the data of all direct text-node children.
        text = ""
        for child in element.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text += child.data
        return text

    def _converter_string(value):
        # Identity converter for plain text properties.
        return value

    def _converter_date(value):
        """Convert an ISO 8601 date string into a datetime.datetime.

        Missing components default to the earliest possible value; a trailing
        timezone designator other than "Z" is folded into the result so the
        returned datetime is in UTC.
        """
        m = iso8601.match(value)
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        seconds = second.to_integral(decimal.ROUND_FLOOR)
        # Bug fix: the fractional part is MICROseconds (datetime's seventh
        # positional argument); both values must be ints, as datetime does
        # not accept decimal.Decimal arguments.
        microseconds = int((second - seconds) * 1000000)
        tzd = m.group("tzd") or "Z"
        dt = datetime.datetime(year, month, day, hour, minute, int(seconds),
                microseconds)
        if tzd != "Z":
            tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
            # Negate the offset so that adding it converts the local
            # timestamp to UTC.
            tzd_hours *= -1
            if tzd_hours < 0:
                tzd_minutes *= -1
            dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        # Build a property getter for an rdf:Bag (unordered array) value.
        def get(self):
            # NOTE(review): falsy cached results (e.g. an empty list) are
            # recomputed on every access — presumably acceptable; confirm.
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
                if len(bags):
                    for bag in bags:
                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
                # NOTE(review): unlike _getter_seq, there is no fallback for a
                # bare (non-Bag) value — confirm this asymmetry is intended.
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        # Build a property getter for an rdf:Seq (ordered array) value.
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if len(seqs):
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
                else:
                    # A bare value is treated as a one-element sequence.
                    value = converter(self._getText(element))
                    retval.append(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        # Build a property getter for an rdf:Alt (language alternatives)
        # value; returns a dict keyed by xml:lang.
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if len(alts):
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    # A bare value becomes the default-language entry.
                    retval["x-default"] = converter(self._getText(element))
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        # Build a property getter for a simple, single-valued property.
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                # Only the first matching node is used.
                break
            if value is not None:
                value = converter(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = value
            return value
        return get

    ##
    # Contributors to the resource (other than the authors). An unsorted
    # array of names.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

    ##
    # Text describing the extent or scope of the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

    ##
    # A sorted array of names of the authors of the resource, listed in order
    # of precedence.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

    ##
    # A sorted array of dates (datetime.datetime instances) of significance to
    # the resource. The dates and times are in UTC.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

    ##
    # A language-keyed dictionary of textual descriptions of the content of the
    # resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

    ##
    # The mime-type of the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

    ##
    # Unique identifier of the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

    ##
    # An unordered array specifying the languages used in the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

    ##
    # An unordered array of publisher names.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

    ##
    # An unordered array of text descriptions of relationships to other
    # documents.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

    ##
    # A language-keyed dictionary of textual descriptions of the rights the
    # user has to this resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

    ##
    # Unique identifier of the work from which this resource was derived.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

    ##
    # An unordered array of descriptive phrases or keywords that specify the
    # topic of the content of the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

    ##
    # A language-keyed dictionary of the title of the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

    ##
    # An unordered array of textual descriptions of the document type.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

    ##
    # An unformatted text string representing document keywords.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

    ##
    # The PDF file version, for example 1.0, 1.3.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

    ##
    # The name of the tool that created the PDF document.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

    ##
    # The date and time the resource was originally created. The date and
    # time are returned as a UTC datetime.datetime object.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))

    ##
    # The date and time the resource was last modified. The date and time
    # are returned as a UTC datetime.datetime object.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

    ##
    # The date and time that any metadata for this resource was last
    # changed. The date and time are returned as a UTC datetime.datetime
    # object.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

    ##
    # The name of the first known tool used to create the resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

    ##
    # The common identifier for all versions and renditions of this resource.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

    ##
    # An identifier for a specific incarnation of a document, updated each
    # time a file is saved.
    # Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))

    def custom_properties(self):
        # Lazily build and memoize the dict of custom pdfx properties.
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find(u"\u2182")
                    if idx == -1:
                        break
                    key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                self._custom_properties[key] = value
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    #
Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a dictionary of key/value items for custom metadata + # properties. + custom_properties = property(custom_properties) + + diff --git a/README b/README new file mode 100644 index 0000000..3d7947a --- /dev/null +++ b/README @@ -0,0 +1,38 @@ +Example: + + from pyPdf import PdfFileWriter, PdfFileReader + + output = PdfFileWriter() + input1 = PdfFileReader(file("document1.pdf", "rb")) + + # add page 1 from input1 to output document, unchanged + output.addPage(input1.getPage(0)) + + # add page 2 from input1, but rotated clockwise 90 degrees + output.addPage(input1.getPage(1).rotateClockwise(90)) + + # add page 3 from input1, rotated the other way: + output.addPage(input1.getPage(2).rotateCounterClockwise(90)) + # alt: output.addPage(input1.getPage(2).rotateClockwise(270)) + + # add page 4 from input1, but first add a watermark from another pdf: + page4 = input1.getPage(3) + watermark = PdfFileReader(file("watermark.pdf", "rb")) + page4.mergePage(watermark.getPage(0)) + + # add page 5 from input1, but crop it to half size: + page5 = input1.getPage(4) + page5.mediaBox.upperRight = ( + page5.mediaBox.getUpperRight_x() / 2, + page5.mediaBox.getUpperRight_y() / 2 + ) + output.addPage(page5) + + # print how many pages input1 has: + print "document1.pdf has %s pages." % input1.getNumPages() + + # finally, write "output" to document-output.pdf + outputStream = file("document-output.pdf", "wb") + output.write(outputStream) + + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..291bfe7 --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +from distutils.core import setup + +long_description = """ +A Pure-Python library built as a PDF toolkit. 
It is capable of: + +- extracting document information (title, author, ...), +- splitting documents page by page, +- merging documents page by page, +- cropping pages, +- merging multiple pages into a single page, +- encrypting and decrypting PDF files. + +By being Pure-Python, it should run on any Python platform without any +dependencies on external libraries. It can also work entirely on StringIO +objects rather than file streams, allowing for PDF manipulation in memory. +It is therefore a useful tool for websites that manage or manipulate PDFs. +""" + +setup( + name="pyPdf", + version="1.12", + description="PDF toolkit", + long_description=long_description, + author="Mathieu Fenniak", + author_email="biziqe@mathieu.fenniak.net", + url="http://pybrary.net/pyPdf/", + download_url="http://pybrary.net/pyPdf/pyPdf-1.12.tar.gz", + classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + packages=["pyPdf"], + ) +