Merge branch 'sylvainpelissier-master'

2018-03-12 14:20:32 -05:00 · 2018-03-12 14:20:32 -05:00 · 461fc5e053
parent a1bfcedf80 6a578d057c
commit 461fc5e053
12 changed files with 178 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 *.pyc
 *.swp
 .DS_Store
+.tox
 build
 .idea/*

--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,21 @@
+# Travis CI configuration: every job runs one tox environment.
+language: python
+python: "2.7"
+sudo: false
+
+# One build job per entry; the value is passed to tox in the script step.
+env:
+  - TOX_ENV=py27
+  - TOX_ENV=py33
+  - TOX_ENV=py34
+  - TOX_ENV=py35
+
+install:
+  # No --use-mirrors here: pip removed that option in 7.0 and now rejects
+  # it, which would fail the build before any test runs.
+  - pip install tox
+
+script:
+  - tox -e $TOX_ENV
+
+matrix:
+  # Python 3.5 not yet available on travis, watch this to see when it is.
+  fast_finish: true
+  allow_failures:
+    - env: TOX_ENV=py35
--- a/PDF_Samples/GeoBase_NHNC1_Data_Model_UML_EN.pdf
+++ b/PDF_Samples/GeoBase_NHNC1_Data_Model_UML_EN.pdf
--- a/PDF_Samples/Seige_of_Vicksburg_Sample_OCR.pdf
+++ b/PDF_Samples/Seige_of_Vicksburg_Sample_OCR.pdf
--- a/PDF_Samples/jpeg.pdf
+++ b/PDF_Samples/jpeg.pdf
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@ -345,10 +345,51 @@ class ASCII85Decode(object):
            return bytes(out)
    decode = staticmethod(decode)

-
+class DCTDecode(object):
+    """Filter for ``/DCTDecode`` streams; the stream bytes are passed through."""
+
+    @staticmethod
+    def decode(data, decodeParms=None):
+        """
+        Return ``data`` unchanged.
+
+        :param data: the encoded stream bytes.
+        :param decodeParms: accepted for signature parity with the other
+            filters; not used.
+        :return: the same object that was passed in as ``data``.
+        """
+        return data
+    
+class JPXDecode(object):
+    """Filter for ``/JPXDecode`` streams; the stream bytes are passed through."""
+
+    @staticmethod
+    def decode(data, decodeParms=None):
+        """
+        Return ``data`` unchanged.
+
+        :param data: the encoded stream bytes.
+        :param decodeParms: accepted for signature parity with the other
+            filters; not used.
+        :return: the same object that was passed in as ``data``.
+        """
+        return data
+    
+class CCITTFaxDecode(object):
+    """
+    Filter for ``/CCITTFaxDecode`` image streams.
+
+    The fax-compressed payload is not decompressed. It is prefixed with a
+    minimal little-endian, single-strip TIFF header so that the result can
+    be written to disk as a TIFF file.
+    """
+    def decode(data, decodeParms=None, height=0):
+        """
+        Return ``data`` prefixed with a TIFF header describing it.
+
+        :param data: raw CCITT-encoded stream bytes.
+        :param decodeParms: the stream's ``/DecodeParms`` dictionary, or
+            ``None``. ``/K`` selects the fax group and ``/Columns`` gives
+            the image width; missing entries fall back to the PDF defaults
+            (``/K`` = 0, ``/Columns`` = 1728).
+        :param height: image height in rows, written to the ImageLength and
+            RowsPerStrip tags.
+        :return: the TIFF header bytes followed by ``data`` unchanged.
+        """
+        # Imported here so the filter does not depend on which names the
+        # module happens to import at top level.
+        import struct
+
+        # A stream without /DecodeParms is legal: use the defaults instead
+        # of failing on an unbound name or on subscripting None.
+        params = decodeParms if decodeParms else {}
+
+        # Subscript access rather than .get(), in case the dictionary class
+        # resolves stored values in __getitem__.
+        K = params["/K"] if "/K" in params else 0
+        width = params["/Columns"] if "/Columns" in params else 1728
+
+        # Any negative /K means pure two-dimensional encoding (Group 4);
+        # zero or positive /K is a Group 3 variant.
+        if K < 0:
+            CCITTgroup = 4
+        else:
+            CCITTgroup = 3
+
+        imgSize = len(data)
+        # Header (2s h l), IFD entry count (h), eight 12-byte IFD entries
+        # (hhll each), then the offset of the next IFD.  That offset is a
+        # 4-byte field in TIFF, hence the trailing 'l'; a 2-byte field
+        # would make it overlap the first bytes of the image data.
+        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'l'
+        tiffHeader = struct.pack(tiff_header_struct,
+                           b'II',  # Byte order indication: Little endian
+                           42,  # Version number (always 42)
+                           8,  # Offset to first IFD
+                           8,  # Number of tags in IFD
+                           256, 4, 1, width,  # ImageWidth, LONG, 1, width
+                           257, 4, 1, height,  # ImageLength, LONG, 1, length
+                           258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
+                           259, 3, 1, CCITTgroup,  # Compression, SHORT, 1, 3 = CCITT Group 3, 4 = CCITT Group 4
+                           262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
+                           273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
+                           278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
+                           279, 4, 1, imgSize,  # StripByteCounts, LONG, 1, size of image
+                           0  # Offset of next IFD: 0 = no further IFD
+                           )
+
+        return tiffHeader + data
+
+    decode = staticmethod(decode)
+    
 def decodeStreamData(stream):
    from .generic import NameObject
    filters = stream.get("/Filter", ())
+
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
@ -364,6 +405,13 @@ def decodeStreamData(stream):
                data = LZWDecode.decode(data, stream.get("/DecodeParms"))
            elif filterType == "/ASCII85Decode" or filterType == "/A85":
                data = ASCII85Decode.decode(data)
+            elif filterType == "/DCTDecode":
+                data = DCTDecode.decode(data)
+            elif filterType == "/JPXDecode":
+                data = JPXDecode.decode(data)
+            elif filterType == "/CCITTFaxDecode":
+                height = stream.get("/Height", ())
+                data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
            elif filterType == "/Crypt":
                decodeParams = stream.get("/DecodeParams", {})
                if "/Name" not in decodeParams and "/Type" not in decodeParams:
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-#PyPDF2
+# PyPDF2 [![Build Status](https://travis-ci.org/sylvainpelissier/PyPDF2.svg)](https://travis-ci.org/sylvainpelissier/PyPDF2)

 PyPDF2 is a pure-python PDF library capable of
 splitting, merging together, cropping, and transforming
@ -31,4 +31,4 @@ Tests can be run from the command line by:

 ```bash
 python -m unittest Tests.tests
-```
+```
--- a/Resources/jpeg.pdf
+++ b/Resources/jpeg.pdf
--- a/Resources/jpeg.txt
+++ b/Resources/jpeg.txt
--- a/Scripts/pdf-image-extractor.py
+++ b/Scripts/pdf-image-extractor.py
@ -0,0 +1,54 @@
+'''
+Extract images from PDF without resampling or altering.
+
+Adapted from work by Sylvain Pelissier
+http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
+
+Usage: python pdf-image-extractor.py input_file [page_index]
+
+Every image XObject found on the selected page is written to the current
+directory, named after its XObject key (without the leading slash).
+'''
+
+import sys
+import PyPDF2
+from PIL import Image
+
+# Zero-based page inspected when no page index is given on the command line.
+# NOTE(review): 30 is the value the script has always used; it looks like a
+# leftover from one specific sample document -- confirm whether 0 was meant.
+DEFAULT_PAGE_INDEX = 30
+
+# Filters whose decoded stream data is written to disk as-is, paired with
+# the file extension to use.
+RAW_FILTERS = (
+    ('/DCTDecode', ".jpg"),
+    ('/JPXDecode', ".jp2"),
+    ('/CCITTFaxDecode', ".tiff"),
+)
+
+
+def print_usage(program):
+    '''Print the command-line synopsis.'''
+    # Explicit field index: bare "{}" is not accepted by Python 2.6.
+    print("\nUsage: python {0} input_file [page_index]\n".format(program))
+
+
+def write_raw(path, data):
+    '''Write ``data`` to ``path`` unchanged, closing the file even on error.'''
+    with open(path, "wb") as out:
+        out.write(data)
+
+
+def extract_images(pdf_path, page_index=DEFAULT_PAGE_INDEX):
+    '''
+    Save every image XObject of one page of ``pdf_path``.
+
+    Images without a filter, or with /FlateDecode, are rebuilt with PIL and
+    saved as PNG.  Images using one of RAW_FILTERS are written byte-for-byte.
+    Images with any other filter are skipped.
+    '''
+    # Keep the PDF open while objects are read from it, and close it after.
+    with open(pdf_path, "rb") as pdf_file:
+        reader = PyPDF2.PdfFileReader(pdf_file)
+        page = reader.getPage(page_index)
+
+        if '/XObject' not in page['/Resources']:
+            print("No image found.")
+            return
+
+        xObject = page['/Resources']['/XObject'].getObject()
+        for obj in xObject:
+            image = xObject[obj]
+            if image['/Subtype'] != '/Image':
+                continue
+
+            size = (image['/Width'], image['/Height'])
+            data = image.getData()
+            if image['/ColorSpace'] == '/DeviceRGB':
+                mode = "RGB"
+            else:
+                mode = "P"
+            name = obj[1:]  # drop the leading "/" of the XObject key
+
+            if '/Filter' not in image or image['/Filter'] == '/FlateDecode':
+                img = Image.frombytes(mode, size, data)
+                img.save(name + ".png")
+                continue
+
+            for filter_name, extension in RAW_FILTERS:
+                if image['/Filter'] == filter_name:
+                    write_raw(name + extension, data)
+                    break
+
+
+def main(argv):
+    '''Parse ``argv`` and run the extraction; return the process exit code.'''
+    if len(argv) not in (2, 3):
+        print_usage(argv[0])
+        return 1
+
+    page_index = DEFAULT_PAGE_INDEX
+    if len(argv) == 3:
+        try:
+            page_index = int(argv[2])
+        except ValueError:
+            print_usage(argv[0])
+            return 1
+
+    extract_images(argv[1], page_index)
+    return 0
+
+
+# Argument handling lives behind the guard so that loading this file as a
+# module neither prints the usage text nor exits the interpreter.
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
--- a/Tests/tests.py
+++ b/Tests/tests.py
@ -1,6 +1,7 @@
 import os
 import sys
 import unittest
+import binascii

 from PyPDF2 import PdfFileReader, PdfFileWriter

@ -27,15 +28,38 @@ class PdfReaderTestCases(unittest.TestCase):
            ipdf_p1 = ipdf.getPage(0)

            # Retrieve the text of the PDF
-            pdftext_file = open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'r')
-            pdftext = pdftext_file.read()
-            ipdf_p1_text = ipdf_p1.extractText().replace('\n', '')
+            with open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'rb') as pdftext_file:
+                pdftext = pdftext_file.read()
+
+            ipdf_p1_text = ipdf_p1.extractText().replace('\n', '').encode('utf-8')

            # Compare the text of the PDF to a known source
-            self.assertEqual(ipdf_p1_text.encode('utf-8', errors='ignore'), pdftext,
+            self.assertEqual(ipdf_p1_text, pdftext,
                msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
-                    % (pdftext, ipdf_p1_text.encode('utf-8', errors='ignore')))
+                    % (pdftext, ipdf_p1_text))

+    def test_PdfReaderJpegImage(self):
+        '''
+        Load jpeg.pdf, fetch the data of image ``/Im4`` from its first page
+        and compare the hex dump of that data with the contents of jpeg.txt.
+        Expected outcome: file loads, image data matches the reference dump.
+        '''
+        pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
+        reference_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')
+
+        with open(pdf_path, 'rb') as inputfile:
+            ipdf = PdfFileReader(inputfile)
+
+            # Reference hex dump of the expected image data
+            with open(reference_path, 'r') as reference_file:
+                imagetext = reference_file.read()
+
+            # The image is looked up by name in the first page's resources
+            images = ipdf.getPage(0)['/Resources']['/XObject'].getObject()
+            data = images['/Im4'].getData()
+
+            # Hex-encode once and reuse it for both the check and the message
+            extracted = binascii.hexlify(data).decode()
+            failure_message = (
+                'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
+                % (imagetext, extracted))
+            self.assertEqual(extracted, imagetext, msg=failure_message)

 class AddJsTestCase(unittest.TestCase):

--- a/tox.ini
+++ b/tox.ini
@ -0,0 +1,21 @@
+# tox configuration: runs the unit tests under each listed interpreter.
+[tox]
+envlist =
+	py26, py27, py33, py34, py35
+
+# Settings shared by every environment: run the Tests.tests module with
+# the standard-library unittest runner.
+[testenv]
+commands = python -m unittest Tests.tests
+
+# One section per environment, each naming the interpreter to use.
+[testenv:py26]
+basepython = python2.6
+
+[testenv:py27]
+basepython = python2.7
+
+[testenv:py33]
+basepython = python3.3
+
+[testenv:py34]
+basepython = python3.4
+
+[testenv:py35]
+basepython = python3.5