Add CCITTFax Decode and JPEG test

2016-01-21 13:42:17 +01:00 · 2016-01-21 13:42:17 +01:00 · 1273824c0f
parent efae6bcae6
commit 1273824c0f
6 changed files with 78 additions and 7 deletions
--- a/PDF_Samples/Seige_of_Vicksburg_Sample_OCR.pdf
+++ b/PDF_Samples/Seige_of_Vicksburg_Sample_OCR.pdf
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@ -331,10 +331,51 @@ class ASCII85Decode(object):
            return bytes(out)
    decode = staticmethod(decode)

-
+class DCTDecode(object):
+    def decode(data, decodeParms=None):
+        return data
+    decode = staticmethod(decode)
+    
+class JPXDecode(object):
+    def decode(data, decodeParms=None):
+        return data
+    decode = staticmethod(decode)
+    
+class CCITTFaxDecode(object):   
+    def decode(data, decodeParms=None, height=0):
+        if decodeParms:
+            if decodeParms.get("/K", 1) == -1:
+                CCITTgroup = 4
+            else:
+                CCITTgroup = 3
+        
+        width = decodeParms["/Columns"]
+        imgSize = len(data)
+        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
+        tiffHeader = struct.pack(tiff_header_struct,
+                           b'II',  # Byte order indication: Little endian
+                           42,  # Version number (always 42)
+                           8,  # Offset to first IFD
+                           8,  # Number of tags in IFD
+                           256, 4, 1, width,  # ImageWidth, LONG, 1, width
+                           257, 4, 1, height,  # ImageLength, LONG, 1, length
+                           258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
+                           259, 3, 1, CCITTgroup,  # Compression, SHORT, 1, 3 = CCITT Group 3 or 4 = CCITT Group 4 fax encoding
+                           262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
+                           273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
+                           278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
+                           279, 4, 1, imgSize,  # StripByteCounts, LONG, 1, size of image
+                           0  # last IFD
+                           )
+        
+        return tiffHeader + data
+    
+    decode = staticmethod(decode)
+    
 def decodeStreamData(stream):
    from .generic import NameObject
    filters = stream.get("/Filter", ())
+
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
@ -350,9 +391,13 @@ def decodeStreamData(stream):
                data = LZWDecode.decode(data, stream.get("/DecodeParms"))
            elif filterType == "/ASCII85Decode" or filterType == "/A85":
                data = ASCII85Decode.decode(data)
-            elif filterType == "/DCTDecode" or filterType == "/JPXDecode":
-                #return raw data for jpg or jpeg2000 image
-                pass
+            elif filterType == "/DCTDecode":
+                data = DCTDecode.decode(data)
+            elif filterType == "/JPXDecode":
+                data = JPXDecode.decode(data)
+            elif filterType == "/CCITTFaxDecode":
+                height = stream.get("/Height", ())
+                data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
            elif filterType == "/Crypt":
                decodeParams = stream.get("/DecodeParams", {})
                if "/Name" not in decodeParams and "/Type" not in decodeParams:
--- a/Resources/jpeg.pdf
+++ b/Resources/jpeg.pdf
--- a/Resources/jpeg.txt
+++ b/Resources/jpeg.txt
--- a/Scripts/pdf-image-extractor.py
+++ b/Scripts/pdf-image-extractor.py
@ -17,14 +17,12 @@ pdf = sys.argv[1]

 if __name__ == '__main__':
    input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
-    page0 = input1.getPage(2)
+    page0 = input1.getPage(30)

    if '/XObject' in page0['/Resources']:
        xObject = page0['/Resources']['/XObject'].getObject()

-        print(xObject)
        for obj in xObject:
-            print(xObject[obj])
            if xObject[obj]['/Subtype'] == '/Image':
                size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                data = xObject[obj].getData()
@ -45,6 +43,10 @@ if __name__ == '__main__':
                        img = open(obj[1:] + ".jp2", "wb")
                        img.write(data)
                        img.close()
+                    elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
+                        img = open(obj[1:] + ".tiff", "wb")
+                        img.write(data)
+                        img.close()
                else:
                    img = Image.frombytes(mode, size, data)
                    img.save(obj[1:] + ".png")
--- a/Tests/tests.py
+++ b/Tests/tests.py
@ -1,6 +1,7 @@
 import os
 import sys
 import unittest
+import binascii

 from PyPDF2 import PdfFileReader, PdfFileWriter

@ -37,6 +38,28 @@ class PdfReaderTestCases(unittest.TestCase):
                msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                    % (pdftext, ipdf_p1_text))

+    def test_PdfReaderJpegImage(self):
+        '''
+        Test loading and parsing of a file. Extract the image from the file and compare its
+        hex-encoded data to the expected output. Expected outcome: file loads, image matches expected.
+        '''
+
+        with open(os.path.join(RESOURCE_ROOT, 'jpeg.pdf'), 'rb') as inputfile:
+            # Load PDF file from file
+            ipdf = PdfFileReader(inputfile)
+        
+            # Read the expected hex-encoded image data from the reference file
+            with open(os.path.join(RESOURCE_ROOT, 'jpeg.txt'), 'r') as pdftext_file:
+                imagetext = pdftext_file.read()
+                
+            ipdf_p0 = ipdf.getPage(0)    
+            xObject = ipdf_p0['/Resources']['/XObject'].getObject()
+            data = xObject['/Im4'].getData()
+    
+            # Compare the hex-encoded image data to the known reference
+            self.assertEqual(binascii.hexlify(data), imagetext, 
+                             msg='PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n' 
+                             % (imagetext, binascii.hexlify(data)))

 class AddJsTestCase(unittest.TestCase):