Merge pull request #1 from maphew/master

Image extractor script with sample failing pdf
This commit is contained in:
Sylvain Pelissier 2016-01-07 08:29:57 +01:00
commit c83cbd87e7
2 changed files with 42 additions and 0 deletions

Binary file not shown.

View File

@@ -0,0 +1,42 @@
'''
Extract images from PDF without resampling or altering.
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
import sys
import PyPDF2
from PIL import Image
def extract_images(pdf_path):
    '''
    Save the image XObjects found on the first page of a PDF.

    Each image is written to the current working directory, named after
    its XObject key without the leading slash:

    * ``/FlateDecode`` images are rebuilt with PIL and saved as ``.png``
      (mode "RGB" for ``/DeviceRGB``, mode "P" for anything else).
    * ``/DCTDecode`` streams are written unchanged as ``.jpg``.
    * ``/JPXDecode`` streams are written unchanged as ``.jp2``.

    Images with any other (or no) ``/Filter`` entry are skipped.

    :param pdf_path: path of the PDF file to read.
    :returns: the number of image files written.
    '''
    written = 0
    # PdfFileReader reads from the stream lazily, so every access to the
    # page and its streams has to happen while the file is still open.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page0 = reader.getPage(0)

        # A page without images has no /XObject dictionary; that is not
        # an error, there is simply nothing to extract.
        if '/Resources' not in page0:
            return written
        resources = page0['/Resources']
        if '/XObject' not in resources:
            return written
        xObject = resources['/XObject'].getObject()

        for name in xObject:
            image = xObject[name]
            if image['/Subtype'] != '/Image':
                continue
            if '/Filter' not in image:
                continue
            stream_filter = image['/Filter']
            # Plain comparisons (not a dict lookup) because /Filter may be
            # an array, which is unhashable; an array matches no branch.
            if stream_filter == '/FlateDecode':
                size = (image['/Width'], image['/Height'])
                # The colour space only matters when rebuilding pixels.
                if ('/ColorSpace' in image
                        and image['/ColorSpace'] == '/DeviceRGB'):
                    mode = "RGB"
                else:
                    mode = "P"
                img = Image.frombytes(mode, size, image.getData())
                img.save(name[1:] + ".png")
            elif stream_filter == '/DCTDecode':
                with open(name[1:] + ".jpg", "wb") as out:
                    out.write(image.getData())
            elif stream_filter == '/JPXDecode':
                with open(name[1:] + ".jp2", "wb") as out:
                    out.write(image.getData())
            else:
                continue
            written += 1
    return written


def main(argv=None):
    '''
    Command-line entry point.

    :param argv: argument list including the program name; defaults to
        ``sys.argv``.
    :returns: the process exit status, 1 after printing the usage message
        when the argument count is wrong, 0 otherwise.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("\nUsage: python {} input_file\n".format(argv[0]))
        return 1
    extract_images(argv[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())