Add CCITTFax Decode and JPEG test

This commit is contained in:
Sylvain Pelissier 2016-01-21 13:42:17 +01:00
parent efae6bcae6
commit 1273824c0f
6 changed files with 78 additions and 7 deletions

View File

@ -331,10 +331,51 @@ class ASCII85Decode(object):
return bytes(out)
decode = staticmethod(decode)
class DCTDecode(object):
    """Pass-through handler for streams using the ``/DCTDecode`` filter."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` exactly as received.

        ``decodeParms`` is accepted so the signature lines up with the
        other filter classes, but it is never read.
        """
        return data
class JPXDecode(object):
    """Pass-through handler for streams using the ``/JPXDecode`` filter."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` exactly as received.

        ``decodeParms`` is accepted so the signature lines up with the
        other filter classes, but it is never read.
        """
        return data
class CCITTFaxDecode(object):
    """Wrap a ``/CCITTFaxDecode`` stream in a minimal TIFF container.

    The fax-encoded bytes are not decompressed here. A single-IFD,
    little-endian TIFF header describing the image is prepended so that the
    result can be written to disk and opened by a TIFF reader.
    """

    def decode(data, decodeParms=None, height=0):
        """Return ``data`` prefixed with a TIFF header describing it.

        :param data: the raw CCITT fax encoded stream bytes.
        :param decodeParms: the stream's ``/DecodeParms`` dictionary, or
            ``None``.  ``/K`` selects the encoding scheme and ``/Columns``
            gives the image width; when absent the PDF defaults are used
            (``/K`` = 0, ``/Columns`` = 1728).
        :param height: the image height in rows, written to the
            ImageLength and RowsPerStrip tags.  Must be an integer.
        :return: the TIFF header followed by the unmodified ``data``.
        """
        # A missing or empty parameter dictionary means "use the defaults"
        # rather than failing on the lookups below.
        parms = decodeParms if decodeParms else {}

        # Any negative /K means pure two-dimensional (Group 4) encoding;
        # zero or positive values are Group 3 variants.
        if parms.get("/K", 0) < 0:
            ccitt_group = 4
        else:
            ccitt_group = 3

        width = parms.get("/Columns", 1728)
        img_size = len(data)

        # Header layout: byte order, version, IFD offset, entry count,
        # eight 12-byte IFD entries, then the 4-byte offset of the next IFD.
        # The trailing field must be 4 bytes wide; a 2-byte field would let
        # the first bytes of the image data be read as part of the offset.
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'l'
        header_size = struct.calcsize(tiff_header_struct)
        tiff_header = struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, height
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, ccitt_group,  # Compression, SHORT, 1, 3 or 4 = CCITT Group 3/4
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, header_size,  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, height
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
            0  # Offset of next IFD: 0 = this is the last IFD
        )

        return tiff_header + data
    decode = staticmethod(decode)
def decodeStreamData(stream):
from .generic import NameObject
filters = stream.get("/Filter", ())
if len(filters) and not isinstance(filters[0], NameObject):
# we have a single filter instance
filters = (filters,)
@ -350,9 +391,13 @@ def decodeStreamData(stream):
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCII85Decode" or filterType == "/A85":
data = ASCII85Decode.decode(data)
elif filterType == "/DCTDecode" or filterType == "/JPXDecode":
#return raw data for jpg or jpeg2000 image
pass
elif filterType == "/DCTDecode":
data = DCTDecode.decode(data)
elif filterType == "/JPXDecode":
data = JPXDecode.decode(data)
elif filterType == "/CCITTFaxDecode":
height = stream.get("/Height", ())
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
elif filterType == "/Crypt":
decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:

BIN
Resources/jpeg.pdf Normal file

Binary file not shown.

1
Resources/jpeg.txt Normal file

File diff suppressed because one or more lines are too long

View File

@ -17,14 +17,12 @@ pdf = sys.argv[1]
if __name__ == '__main__':
input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
page0 = input1.getPage(2)
page0 = input1.getPage(30)
if '/XObject' in page0['/Resources']:
xObject = page0['/Resources']['/XObject'].getObject()
print(xObject)
for obj in xObject:
print(xObject[obj])
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj].getData()
@ -45,6 +43,10 @@ if __name__ == '__main__':
img = open(obj[1:] + ".jp2", "wb")
img.write(data)
img.close()
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
img = open(obj[1:] + ".tiff", "wb")
img.write(data)
img.close()
else:
img = Image.frombytes(mode, size, data)
img.save(obj[1:] + ".png")

View File

@ -1,6 +1,7 @@
import os
import sys
import unittest
import binascii
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -37,6 +38,28 @@ class PdfReaderTestCases(unittest.TestCase):
msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
% (pdftext, ipdf_p1_text))
def test_PdfReaderJpegImage(self):
    '''
    Test loading and parsing of a file. Extract the image of the file and
    compare its hexadecimal dump to the expected one stored in a text
    file. Expected outcome: file loads, image matches expected.
    '''
    with open(os.path.join(RESOURCE_ROOT, 'jpeg.pdf'), 'rb') as inputfile:
        # Load PDF file from file
        ipdf = PdfFileReader(inputfile)

        # Load the expected image bytes, stored as hexadecimal text.
        # Surrounding whitespace (e.g. a trailing newline) is not part of
        # the dump and would make the comparison fail spuriously.
        with open(os.path.join(RESOURCE_ROOT, 'jpeg.txt'), 'r') as pdftext_file:
            imagetext = pdftext_file.read().strip()

        ipdf_p0 = ipdf.getPage(0)
        xObject = ipdf_p0['/Resources']['/XObject'].getObject()
        data = xObject['/Im4'].getData()

        # hexlify() returns bytes on Python 3 while the expected value was
        # read in text mode; decode so both sides are text.
        extracted = binascii.hexlify(data).decode('ascii')

        # Compare the extracted image to a known source
        self.assertEqual(extracted, imagetext,
            msg='PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (imagetext, extracted))
class AddJsTestCase(unittest.TestCase):