Merge pull request #1 from maphew/master
Image extractor script with sample failing pdf
This commit is contained in:
commit
c83cbd87e7
Binary file not shown.
|
@ -0,0 +1,42 @@
|
|||
'''
|
||||
Extract images from PDF without resampling or altering.
|
||||
|
||||
Adapted from work by Sylvain Pelissier
|
||||
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
|
||||
'''
|
||||
|
||||
import sys
|
||||
import PyPDF2
|
||||
from PIL import Image
|
||||
|
||||
# Filters whose stream bytes already form a complete image file and can be
# written to disk unchanged, mapped to the extension used for the output.
_PASSTHROUGH_EXT = {
    '/DCTDecode': ".jpg",
    '/JPXDecode': ".jp2",
}


def _lookup(pdf_dict, key, default=None):
    '''Return ``pdf_dict[key]`` when *key* is present, otherwise *default*.

    Item access is used instead of ``dict.get`` on purpose.
    NOTE(review): PyPDF2 dictionaries are believed to resolve indirect
    references only through ``[]`` access, not through ``.get`` -- confirm
    against the installed PyPDF2 version.
    '''
    return pdf_dict[key] if key in pdf_dict else default


def _single_filter(stream):
    '''Return the stream's ``/Filter`` as a single name, or None.

    A PDF may store the filter either as a bare name or as an array.  A
    one-element array is unwrapped; longer filter chains are not handled by
    this script and yield None so the caller skips the image.
    '''
    filt = _lookup(stream, '/Filter')
    if isinstance(filt, (list, tuple)):
        return filt[0] if len(filt) == 1 else None
    return filt


def _pil_mode(color_space):
    '''Map a PDF ``/ColorSpace`` value to the PIL mode used for decoding.

    ``/DeviceRGB`` becomes "RGB" and ``/DeviceGray`` becomes "L"; anything
    else (including a missing colour space) keeps the original fallback "P".
    '''
    if color_space == '/DeviceRGB':
        return "RGB"
    if color_space == '/DeviceGray':
        return "L"
    return "P"


def _raw_bytes(stream):
    '''Return the bytes of an already-encoded (JPEG / JPEG 2000) stream.'''
    try:
        return stream.getData()
    except NotImplementedError:
        # NOTE(review): some PyPDF2 releases are reported to raise
        # NotImplementedError from getData() for filters they cannot decode;
        # the undecoded payload is then assumed to live in ``_data``.
        # Confirm against the installed PyPDF2 version.
        return stream._data


def extract_images(x_objects):
    '''Write every supported image in *x_objects* to the current directory.

    *x_objects* is a mapping from XObject name (for example ``/Im0``) to the
    XObject's stream dictionary.  Output files are named after the XObject
    with its leading ``/`` removed:

    * ``/FlateDecode`` images are rebuilt with PIL and saved as ``.png``;
    * ``/DCTDecode`` images are written byte-for-byte as ``.jpg``;
    * ``/JPXDecode`` images are written byte-for-byte as ``.jp2``.

    Entries that are not images, or that use any other filter, are skipped;
    their data is never requested, so one unsupported image cannot abort the
    rest of the run.

    Returns the list of file names written, in iteration order.
    '''
    written = []
    for name in x_objects:
        stream = x_objects[name]
        if _lookup(stream, '/Subtype') != '/Image':
            continue

        stem = name[1:]  # drop the leading "/" of the PDF name
        filt = _single_filter(stream)

        if filt == '/FlateDecode':
            size = (stream['/Width'], stream['/Height'])
            mode = _pil_mode(_lookup(stream, '/ColorSpace'))
            out_name = stem + ".png"
            Image.frombytes(mode, size, stream.getData()).save(out_name)
        elif filt in _PASSTHROUGH_EXT:
            out_name = stem + _PASSTHROUGH_EXT[filt]
            with open(out_name, "wb") as out_file:
                out_file.write(_raw_bytes(stream))
        else:
            sys.stderr.write(
                "Skipping {}: unsupported filter {!r}\n".format(name, filt))
            continue

        written.append(out_name)
    return written


def main(argv=None):
    '''Extract the images found on the first page of a PDF.

    *argv* has the same layout as ``sys.argv`` (program name followed by the
    input file) and defaults to ``sys.argv``.

    Returns the process exit status: 1 after printing the usage message when
    the argument count is wrong, 0 otherwise.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        prog = argv[0] if argv else "script"
        print("\nUsage: python {} input_file\n".format(prog))
        return 1

    # The reader pulls data from the file on demand, so the file has to stay
    # open until extraction is finished; ``with`` closes it afterwards.
    with open(argv[1], "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page0 = reader.getPage(0)
        resources = _lookup(page0, '/Resources', {})
        if '/XObject' not in resources:
            sys.stderr.write("No XObjects on the first page; nothing to do.\n")
            return 0
        extract_images(resources['/XObject'].getObject())
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
Loading…
Reference in New Issue