debian-pypdf4/Scripts/pdf-image-extractor.py

53 lines
1.8 KiB
Python

'''
Extract images from PDF without resampling or altering.
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
import sys
import PyPDF2
from PIL import Image
if (len(sys.argv) != 2):
print("\nUsage: python {} input_file\n".format(sys.argv[0]))
sys.exit(1)
pdf = sys.argv[1]
if __name__ == '__main__':
input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
page0 = input1.getPage(2)
if '/XObject' in page0['/Resources']:
xObject = page0['/Resources']['/XObject'].getObject()
print(xObject)
for obj in xObject:
print(xObject[obj])
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj].getData()
if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
mode = "RGB"
else:
mode = "P"
if '/Filter' in xObject[obj]:
if xObject[obj]['/Filter'] == '/FlateDecode':
img = Image.frombytes(mode, size, data)
img.save(obj[1:] + ".png")
elif xObject[obj]['/Filter'] == '/DCTDecode':
img = open(obj[1:] + ".jpg", "wb")
img.write(data)
img.close()
elif xObject[obj]['/Filter'] == '/JPXDecode':
img = open(obj[1:] + ".jp2", "wb")
img.write(data)
img.close()
else:
img = Image.frombytes(mode, size, data)
img.save(obj[1:] + ".png")
else:
print("No image found.")