Merge branch 'sylvainpelissier-master'
This commit is contained in:
commit
461fc5e053
|
@ -1,6 +1,7 @@
|
|||
*.pyc
|
||||
*.swp
|
||||
.DS_Store
|
||||
.tox
|
||||
build
|
||||
.idea/*
|
||||
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
language: python
|
||||
python: "2.7"
|
||||
sudo: false
|
||||
|
||||
env:
|
||||
- TOX_ENV=py27
|
||||
- TOX_ENV=py33
|
||||
- TOX_ENV=py34
|
||||
- TOX_ENV=py35
|
||||
|
||||
install:
|
||||
- pip install tox --use-mirrors
|
||||
|
||||
script:
|
||||
- tox -e $TOX_ENV
|
||||
|
||||
matrix:
|
||||
# Python 3.5 not yet available on travis, watch this to see when it is.
|
||||
fast_finish: true
|
||||
allow_failures:
|
||||
- env: TOX_ENV=py35
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -345,10 +345,51 @@ class ASCII85Decode(object):
|
|||
return bytes(out)
|
||||
decode = staticmethod(decode)
|
||||
|
||||
|
||||
class DCTDecode(object):
    """Pass-through filter for ``/DCTDecode`` streams."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` exactly as received; ``decodeParms`` is unused."""
        return data
|
||||
|
||||
class JPXDecode(object):
    """Pass-through filter for ``/JPXDecode`` streams."""

    @staticmethod
    def decode(data, decodeParms=None):
        """Return ``data`` exactly as received; ``decodeParms`` is unused."""
        return data
|
||||
|
||||
class CCITTFaxDecode(object):
    """Wrap CCITT fax stream data in a minimal TIFF container.

    The compressed bytes are not decoded.  A little-endian, single-strip
    TIFF header describing them is prepended and the data is appended
    unchanged.
    """

    def decode(data, decodeParms=None, height=0):
        """Return ``data`` prefixed with a TIFF header.

        :param data: raw CCITT-encoded bytes.
        :param decodeParms: mapping with the optional keys ``/K`` and
            ``/Columns``, or ``None``.  ``/K == -1`` selects Group 4, any
            other value Group 3.  Missing values fall back to the PDF
            defaults (``/K`` 0, ``/Columns`` 1728) instead of raising.
        :param height: number of rows, written as ImageLength and
            RowsPerStrip.
        :return: the TIFF header followed by ``data``.
        """
        # Treat a missing parameter dictionary as an empty one so the
        # defaults below apply; previously this path crashed.
        parms = decodeParms if decodeParms else {}

        if parms.get("/K", 0) == -1:
            CCITTgroup = 4
        else:
            CCITTgroup = 3

        width = parms.get("/Columns", 1728)
        imgSize = len(data)

        # Layout: byte order, magic, IFD offset, entry count, then eight
        # 12-byte IFD entries (tag, type, count, value), then the next-IFD
        # marker.
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
        tiffHeader = struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, height
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITTgroup,  # Compression, SHORT, 1, 3 or 4 = CCITT Group 3 / Group 4
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            # The strip starts right after the header.
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, height
            279, 4, 1, imgSize,  # StripByteCounts, LONG, 1, size of image
            0  # last IFD
        )

        return tiffHeader + data

    decode = staticmethod(decode)
|
||||
|
||||
def decodeStreamData(stream):
|
||||
from .generic import NameObject
|
||||
filters = stream.get("/Filter", ())
|
||||
|
||||
if len(filters) and not isinstance(filters[0], NameObject):
|
||||
# we have a single filter instance
|
||||
filters = (filters,)
|
||||
|
@ -364,6 +405,13 @@ def decodeStreamData(stream):
|
|||
data = LZWDecode.decode(data, stream.get("/DecodeParms"))
|
||||
elif filterType == "/ASCII85Decode" or filterType == "/A85":
|
||||
data = ASCII85Decode.decode(data)
|
||||
elif filterType == "/DCTDecode":
|
||||
data = DCTDecode.decode(data)
|
||||
elif filterType == "/JPXDecode":
|
||||
data = JPXDecode.decode(data)
|
||||
elif filterType == "/CCITTFaxDecode":
|
||||
height = stream.get("/Height", ())
|
||||
data = CCITTFaxDecode.decode(data, stream.get("/DecodeParms"), height)
|
||||
elif filterType == "/Crypt":
|
||||
decodeParams = stream.get("/DecodeParams", {})
|
||||
if "/Name" not in decodeParams and "/Type" not in decodeParams:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#PyPDF2
|
||||
# PyPDF2 [![Build Status](https://travis-ci.org/sylvainpelissier/PyPDF2.svg)](https://travis-ci.org/sylvainpelissier/PyPDF2)
|
||||
|
||||
PyPDF2 is a pure-python PDF library capable of
|
||||
splitting, merging together, cropping, and transforming
|
||||
|
@ -31,4 +31,4 @@ Tests can be run from the command line by:
|
|||
|
||||
```bash
|
||||
python -m unittest Tests.tests
|
||||
```
|
||||
```
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,54 @@
|
|||
'''
|
||||
Extract images from PDF without resampling or altering.
|
||||
|
||||
Adapted from work by Sylvain Pelissier
|
||||
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
|
||||
'''
|
||||
|
||||
import sys
|
||||
import PyPDF2
|
||||
from PIL import Image
|
||||
|
||||
# Exactly one argument is required: the path of the PDF to read.
if (len(sys.argv) != 2):
    print("\nUsage: python {} input_file\n".format(sys.argv[0]))
    sys.exit(1)

pdf = sys.argv[1]

if __name__ == '__main__':
    # The reader is only used inside this block, so the input handle is
    # closed on exit even if extraction raises.
    with open(pdf, "rb") as pdf_file:
        input1 = PyPDF2.PdfFileReader(pdf_file)
        # NOTE(review): page index 30 is hard-coded although the variable is
        # named page0 -- confirm which page is intended.
        page0 = input1.getPage(30)

        if '/XObject' in page0['/Resources']:
            xObject = page0['/Resources']['/XObject'].getObject()

            for obj in xObject:
                if xObject[obj]['/Subtype'] == '/Image':
                    size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                    data = xObject[obj].getData()
                    if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
                        mode = "RGB"
                    else:
                        mode = "P"

                    # Output name is the XObject key without its first
                    # character.
                    if '/Filter' in xObject[obj]:
                        if xObject[obj]['/Filter'] == '/FlateDecode':
                            img = Image.frombytes(mode, size, data)
                            img.save(obj[1:] + ".png")
                        elif xObject[obj]['/Filter'] == '/DCTDecode':
                            # Bytes are written as returned by getData().
                            with open(obj[1:] + ".jpg", "wb") as img:
                                img.write(data)
                        elif xObject[obj]['/Filter'] == '/JPXDecode':
                            with open(obj[1:] + ".jp2", "wb") as img:
                                img.write(data)
                        elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
                            with open(obj[1:] + ".tiff", "wb") as img:
                                img.write(data)
                    else:
                        # No filter entry: build the image from the raw bytes.
                        img = Image.frombytes(mode, size, data)
                        img.save(obj[1:] + ".png")
        else:
            print("No image found.")
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import sys
|
||||
import unittest
|
||||
import binascii
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
|
@ -27,15 +28,38 @@ class PdfReaderTestCases(unittest.TestCase):
|
|||
ipdf_p1 = ipdf.getPage(0)
|
||||
|
||||
# Retrieve the text of the PDF
|
||||
pdftext_file = open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'r')
|
||||
pdftext = pdftext_file.read()
|
||||
ipdf_p1_text = ipdf_p1.extractText().replace('\n', '')
|
||||
with open(os.path.join(RESOURCE_ROOT, 'crazyones.txt'), 'rb') as pdftext_file:
|
||||
pdftext = pdftext_file.read()
|
||||
|
||||
ipdf_p1_text = ipdf_p1.extractText().replace('\n', '').encode('utf-8')
|
||||
|
||||
# Compare the text of the PDF to a known source
|
||||
self.assertEqual(ipdf_p1_text.encode('utf-8', errors='ignore'), pdftext,
|
||||
self.assertEqual(ipdf_p1_text, pdftext,
|
||||
msg='PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
|
||||
% (pdftext, ipdf_p1_text.encode('utf-8', errors='ignore')))
|
||||
% (pdftext, ipdf_p1_text))
|
||||
|
||||
def test_PdfReaderJpegImage(self):
    '''
    Open jpeg.pdf, read the data of the /Im4 XObject on its first page and
    check that the hex dump of that data equals the contents of jpeg.txt.
    '''
    pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
    reference_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

    with open(pdf_path, 'rb') as inputfile:
        # Parse the PDF from the open binary handle
        ipdf = PdfFileReader(inputfile)

        # Expected hex dump of the image data
        with open(reference_path, 'r') as pdftext_file:
            imagetext = pdftext_file.read()

        first_page = ipdf.getPage(0)
        resources = first_page['/Resources']
        xObject = resources['/XObject'].getObject()
        data = xObject['/Im4'].getData()

        # Hex-encode once and reuse for both the comparison and the message
        extracted_hex = binascii.hexlify(data).decode()
        failure_msg = ('PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                       % (imagetext, extracted_hex))
        self.assertEqual(extracted_hex, imagetext, msg=failure_msg)
|
||||
|
||||
class AddJsTestCase(unittest.TestCase):
|
||||
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
[tox]
|
||||
envlist =
|
||||
py26, py27, py33, py34, py35
|
||||
|
||||
[testenv]
|
||||
commands = python -m unittest Tests.tests
|
||||
|
||||
[testenv:py26]
|
||||
basepython = python2.6
|
||||
|
||||
[testenv:py27]
|
||||
basepython = python2.7
|
||||
|
||||
[testenv:py33]
|
||||
basepython = python3.3
|
||||
|
||||
[testenv:py34]
|
||||
basepython = python3.4
|
||||
|
||||
[testenv:py35]
|
||||
basepython = python3.5
|
Loading…
Reference in New Issue