debian-pdfrw/pdfrw/uncompress.py

118 lines
4.5 KiB
Python

# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# Copyright (C) 2012-2015 Nerijus Mika
# MIT license -- See LICENSE.txt for details
# Copyright (c) 2006, Mathieu Fenniak
# BSD license -- see LICENSE.txt for details
'''
A small subset of decompression filters. Should add more later.
I believe, after looking at the code, that portions of the flate
PNG predictor were originally transcribed from PyPDF2, which is
probably an excellent source of additional filters.
'''
import array
from .objects import PdfDict, PdfName, PdfArray
from .errors import log
from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
for obj in mylist:
if isinstance(obj, PdfDict) and obj.stream is not None:
yield obj
# Hack so we can import if zlib not available
decompressobj = zlib if zlib is None else zlib.decompressobj
def uncompress(mylist, leave_raw=False, warnings=set(),
flate=PdfName.FlateDecode, decompress=decompressobj,
isinstance=isinstance, list=list, len=len):
ok = True
for obj in streamobjects(mylist):
ftype = obj.Filter
if ftype is None:
continue
if isinstance(ftype, list) and len(ftype) == 1:
# todo: multiple filters
ftype = ftype[0]
parms = obj.DecodeParms or obj.DP
if ftype != flate:
msg = ('Not decompressing: cannot use filter %s'
' with parameters %s') % (repr(ftype), repr(parms))
if msg not in warnings:
warnings.add(msg)
log.warning(msg)
ok = False
else:
dco = decompress()
try:
data = dco.decompress(convert_store(obj.stream))
except Exception as s:
error = str(s)
else:
error = None
if isinstance(parms, PdfArray):
oldparms = parms
parms = PdfDict()
for x in oldparms:
parms.update(x)
if parms:
predictor = int(parms.Predictor or 1)
columns = int(parms.Columns or 1)
colors = int(parms.Colors or 1)
bpc = int(parms.BitsPerComponent or 8)
if 10 <= predictor <= 15:
data, error = flate_png(data, predictor, columns, colors, bpc)
elif predictor != 1:
error = ('Unsupported flatedecode predictor %s' %
repr(predictor))
if error is None:
assert not dco.unconsumed_tail
if dco.unused_data.strip():
error = ('Unconsumed compression data: %s' %
repr(dco.unused_data[:20]))
if error is None:
obj.Filter = None
obj.stream = data if leave_raw else convert_load(data)
else:
log.error('%s %s' % (error, repr(obj.indirect)))
ok = False
return ok
def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
''' PNG prediction is used to make certain kinds of data
more compressible. Before the compression, each data
byte is either left the same, or is set to be a delta
from the previous byte, or is set to be a delta from
the previous row. This selection is done on a per-row
basis, and is indicated by a compression type byte
prepended to each row of data.
Within more recent PDF files, it is normal to use
this technique for Xref stream objects, which are
quite regular.
'''
columnbytes = ((columns * colors * bpc) + 7) // 8
data = array.array('B', data)
rowlen = columnbytes + 1
if predictor == 15:
padding = (rowlen - len(data)) % rowlen
data.extend([0] * padding)
assert len(data) % rowlen == 0
rows = xrange(0, len(data), rowlen)
for row_index in rows:
offset = data[row_index]
if offset >= 2:
if offset > 2:
return None, 'Unsupported PNG filter %d' % offset
offset = rowlen if row_index else 0
if offset:
for index in xrange(row_index + 1, row_index + rowlen):
data[index] = (data[index] + data[index - offset]) % 256
for row_index in reversed(rows):
data.pop(row_index)
return from_array(data), None