Merge branch 'sylvainpelissier-master'

This commit is contained in:
Matthew Stamy 2018-03-12 14:20:32 -05:00
commit 461fc5e053
12 changed files with 178 additions and 8 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
*.pyc
*.swp
.DS_Store
.tox
build
.idea/*

21
.travis.yml Normal file
View File

@ -0,0 +1,21 @@
# Travis CI configuration: run the unit tests through tox, one job per
# TOX_ENV value listed below.
language: python
python: "2.7"
sudo: false
env:
  - TOX_ENV=py27
  - TOX_ENV=py33
  - TOX_ENV=py34
  - TOX_ENV=py35
install:
  # The --use-mirrors option was deprecated in pip 1.5 and removed in pip 7.0;
  # passing it to a current pip aborts the install step with "no such option".
  - pip install tox
script:
  - tox -e $TOX_ENV
matrix:
  # Python 3.5 not yet available on travis, watch this to see when it is.
  fast_finish: true
  allow_failures:
    - env: TOX_ENV=py35

Binary file not shown.

Binary file not shown.

BIN
PDF_Samples/jpeg.pdf Normal file

Binary file not shown.

View File

@ -345,10 +345,51 @@ class ASCII85Decode(object):
return bytes(out)
decode = staticmethod(decode)
class DCTDecode(object):
    """Pass-through handler for streams whose filter is ``/DCTDecode``.

    No decoding is performed: the stream bytes are handed back untouched so
    the caller can store or process them as they are.
    """

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` unchanged.

        :param data: the raw stream content.
        :param decodeParms: accepted for signature parity with the other
            filters in this module; it is not consulted.
        :return: the very same ``data`` object.
        """
        return data
class JPXDecode(object):
    """Pass-through handler for streams whose filter is ``/JPXDecode``.

    No decoding is performed: the stream bytes are handed back untouched so
    the caller can store or process them as they are.
    """

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` unchanged.

        :param data: the raw stream content.
        :param decodeParms: accepted for signature parity with the other
            filters in this module; it is not consulted.
        :return: the very same ``data`` object.
        """
        return data
class CCITTFaxDecode(object):
    """Wrap ``/CCITTFaxDecode`` stream data in a minimal TIFF container.

    The fax-encoded bytes are not decompressed. A single-IFD, little-endian
    TIFF header describing the image is prepended so that the result can be
    written to disk as a ``.tiff`` file.
    """

    def decode(data, decodeParms=None, height=0):
        """Return ``data`` prefixed with a TIFF header describing it.

        :param data: the raw CCITT-encoded stream bytes.
        :param decodeParms: the stream's ``/DecodeParms`` dictionary, or
            ``None``. ``/K`` selects the encoding scheme (negative means
            Group 4, otherwise Group 3; defaults to 0) and ``/Columns``
            gives the image width in pixels (defaults to 1728). Both
            defaults are the ones the PDF specification assigns.
        :param height: image height in rows. When falsy (``0``, or the
            empty tuple the stream decoder in this module passes when the
            stream has no ``/Height``), ``/Rows`` from ``decodeParms`` is
            used instead, falling back to 0.
        :return: TIFF header bytes followed by ``data``.
        """
        # Imported locally so the block does not depend on a module-level
        # import that may or may not be present.
        import struct

        # A missing parameter dictionary means "all defaults", not an error.
        params = decodeParms or {}

        # Any negative /K denotes pure two-dimensional (Group 4) encoding;
        # zero and positive values are Group 3 variants. The TIFF
        # Compression tag uses the values 3 and 4 for these two schemes.
        if params.get("/K", 0) < 0:
            CCITTgroup = 4
        else:
            CCITTgroup = 3

        width = params.get("/Columns", 1728)
        if not height:
            height = params.get("/Rows", 0)

        imgSize = len(data)
        # NOTE(review): TIFF 6.0 defines the trailing "next IFD" offset as a
        # 4-byte field, but this header has always emitted 2 bytes there.
        # Kept unchanged so the output stays byte-compatible - confirm.
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
        tiffHeader = struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITTgroup,  # Compression, SHORT, 1, 3 or 4 = CCITT Group 3/4 fax encoding
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, imgSize,  # StripByteCounts, LONG, 1, size of image
            0  # last IFD
        )

        return tiffHeader + data
    decode = staticmethod(decode)
def decodeStreamData(stream):
from .generic import NameObject
filters = stream.get("/Filter", ())
if len(filters) and not isinstance(filters[0], NameObject):
# we have a single filter instance
filters = (filters,)
@ -364,6 +405,13 @@ def decodeStreamData(stream):
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
elif filterType == "/ASCII85Decode" or filterType == "/A85":
data = ASCII85Decode.decode(data)
elif filterType == "/DCTDecode":
data = DCTDecode.decode(data)
elif filterType == "/JPXDecode":
data = JPXDecode.decode(data)
elif filterType == "/CCITTFaxDecode":
height = stream.get("/Height", ())
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
elif filterType == "/Crypt":
decodeParams = stream.get("/DecodeParams", {})
if "/Name" not in decodeParams and "/Type" not in decodeParams:

View File

@ -1,4 +1,4 @@
#PyPDF2
#PyPDF2 [![Build Status](https://travis-ci.org/sylvainpelissier/PyPDF2.svg)](https://travis-ci.org/sylvainpelissier/PyPDF2)
PyPDF2 is a pure-python PDF library capable of
splitting, merging together, cropping, and transforming
@ -31,4 +31,4 @@ Tests can be run from the command line by:
```bash
python -m unittest Tests.tests
```
```

BIN
Resources/jpeg.pdf Normal file

Binary file not shown.

1
Resources/jpeg.txt Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,54 @@
'''
Extract images from PDF without resampling or altering.
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
import sys
import PyPDF2
from PIL import Image
# Exactly one argument is expected: the path of the PDF to read.
if (len(sys.argv) != 2):
    print("\nUsage: python {} input_file\n".format(sys.argv[0]))
    sys.exit(1)

pdf = sys.argv[1]

if __name__ == '__main__':
    # The reader is handed the open file object and image data is pulled
    # from it later via getData(), so the file is kept open for the whole
    # extraction and closed deterministically afterwards.
    with open(pdf, "rb") as pdf_file:
        input1 = PyPDF2.PdfFileReader(pdf_file)
        # NOTE(review): the page index is hard-coded to 30, so documents with
        # fewer pages fail here - confirm whether page 0 was intended.
        page0 = input1.getPage(30)

        if '/XObject' in page0['/Resources']:
            xObject = page0['/Resources']['/XObject'].getObject()

            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] == '/Image':
                    size = (image['/Width'], image['/Height'])
                    data = image.getData()
                    if image['/ColorSpace'] == '/DeviceRGB':
                        mode = "RGB"
                    else:
                        mode = "P"

                    # Output files are named after the XObject key without
                    # its leading slash.
                    basename = obj[1:]

                    if '/Filter' in image:
                        if image['/Filter'] == '/FlateDecode':
                            # Decoded pixel data: rebuild the image with PIL.
                            img = Image.frombytes(mode, size, data)
                            img.save(basename + ".png")
                        elif image['/Filter'] == '/DCTDecode':
                            # getData() output is written out verbatim.
                            with open(basename + ".jpg", "wb") as img:
                                img.write(data)
                        elif image['/Filter'] == '/JPXDecode':
                            with open(basename + ".jp2", "wb") as img:
                                img.write(data)
                        elif image['/Filter'] == '/CCITTFaxDecode':
                            with open(basename + ".tiff", "wb") as img:
                                img.write(data)
                    else:
                        # Unfiltered stream: treat the bytes as raw pixels.
                        img = Image.frombytes(mode, size, data)
                        img.save(basename + ".png")
        else:
            print("No image found.")

View File

@ -1,6 +1,7 @@
import os
import sys
import unittest
import binascii
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -27,15 +28,38 @@ class PdfReaderTestCases(unittest.TestCase):
ipdf_p1 = ipdf.getPage(0)
# Retrieve the text of the PDF
pdftext_file = open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'r')
pdftext = pdftext_file.read()
ipdf_p1_text = ipdf_p1.extractText().replace('\n', '')
with open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'rb') as pdftext_file:
pdftext = pdftext_file.read()
ipdf_p1_text = ipdf_p1.extractText().replace('\n', '').encode('utf-8')
# Compare the text of the PDF to a known source
self.assertEqual(ipdf_p1_text.encode('utf-8', errors='ignore'), pdftext,
self.assertEqual(ipdf_p1_text, pdftext,
msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
% (pdftext, ipdf_p1_text.encode('utf-8', errors='ignore')))
% (pdftext, ipdf_p1_text))
def test_PdfReaderJpegImage(self):
    '''
    Load jpeg.pdf, pull the data of the '/Im4' image XObject from its first
    page and compare its hex dump against the contents of jpeg.txt.
    Expected outcome: file loads, image matches expected.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
    expected_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

    with open(pdf_path, 'rb') as inputfile:
        # Parse the PDF straight from the open file handle
        reader = PdfFileReader(inputfile)

        # Load the expected hex dump of the image
        with open(expected_path, 'r') as expected_file:
            imagetext = expected_file.read()

        # Walk from the first page to the image XObject and fetch its data
        first_page = reader.getPage(0)
        xobjects = first_page['/Resources']['/XObject'].getObject()
        extracted_hex = binascii.hexlify(xobjects['/Im4'].getData()).decode()

        # The hex dump of the extracted data must equal the stored reference
        failure_message = (
            'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
            % (imagetext, extracted_hex)
        )
        self.assertEqual(extracted_hex, imagetext, msg=failure_message)
class AddJsTestCase(unittest.TestCase):

21
tox.ini Normal file
View File

@ -0,0 +1,21 @@
[tox]
envlist =
py26, py27, py33, py34, py35
[testenv]
commands = python -m unittest Tests.tests
[testenv:py26]
basepython = python2.6
[testenv:py27]
basepython = python2.7
[testenv:py33]
basepython = python3.3
[testenv:py34]
basepython = python3.4
[testenv:py35]
basepython = python3.5