"""Extract images from one page of a PDF without resampling or altering them.

Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python

Usage:
    python <script> input_file

Every image XObject found on page ``PAGE_INDEX`` of ``input_file`` is written
to the current working directory, named after its XObject name:

* ``/DCTDecode`` streams are written as ``.jpg``,
* ``/JPXDecode`` streams are written as ``.jp2``,
* ``/FlateDecode`` streams and streams without a ``/Filter`` entry are
  rebuilt with PIL from the data returned by ``getData()`` and saved as
  ``.png``.
"""

import sys

import PyPDF2
from PIL import Image

# Zero-based index of the page whose images are extracted (the third page).
PAGE_INDEX = 2


def _pil_mode(image):
    """Return the PIL mode used to rebuild the pixel data of *image*.

    "RGB" is used for ``/DeviceRGB`` images; everything else falls back to
    "P". The ``/ColorSpace`` entry is looked up defensively because not every
    image XObject carries one, and indexing it blindly raises ``KeyError``.
    """
    if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
        return "RGB"
    return "P"


def _save_image(name, image):
    """Write the image XObject *image* to a file named after *name*.

    Args:
        name: XObject name as stored in the page resources (e.g. ``/Im0``);
            the leading slash is dropped to build the output file name.
        image: the image XObject (a PyPDF2 stream object).
    """
    size = (image['/Width'], image['/Height'])
    data = image.getData()
    mode = _pil_mode(image)
    stem = name[1:]

    if '/Filter' not in image or image['/Filter'] == '/FlateDecode':
        # Pixel data: let PIL wrap it and encode it as PNG.
        img = Image.frombytes(mode, size, data)
        img.save(stem + ".png")
    elif image['/Filter'] == '/DCTDecode':
        # The stream is written out unchanged under a .jpg name.
        with open(stem + ".jpg", "wb") as out:
            out.write(data)
    elif image['/Filter'] == '/JPXDecode':
        # The stream is written out unchanged under a .jp2 name.
        with open(stem + ".jp2", "wb") as out:
            out.write(data)
    else:
        # Previously such images were dropped without any notice. Report on
        # stderr so the regular stdout output stays unchanged.
        sys.stderr.write(
            "Skipping {}: unsupported filter {}\n".format(stem, image['/Filter'])
        )


def extract_images(pdf_path, page_index=PAGE_INDEX):
    """Extract every image XObject from one page of the PDF at *pdf_path*.

    Args:
        pdf_path: path of the PDF file to read.
        page_index: zero-based index of the page to scan.

    Raises:
        IndexError: if the document has no page at *page_index*.
    """
    # Keep the file open for the whole extraction: objects are fetched from
    # the reader while the loop below runs.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)

        page_count = reader.getNumPages()
        if not 0 <= page_index < page_count:
            raise IndexError(
                "page index {} out of range: {} has {} page(s)".format(
                    page_index, pdf_path, page_count
                )
            )
        page = reader.getPage(page_index)

        if '/Resources' not in page or '/XObject' not in page['/Resources']:
            print("No image found.")
            return

        x_objects = page['/Resources']['/XObject'].getObject()
        print(x_objects)
        for name in x_objects:
            candidate = x_objects[name]
            print(candidate)
            if candidate['/Subtype'] == '/Image':
                _save_image(name, candidate)


def main(argv):
    """Command-line entry point; *argv* is the full ``sys.argv`` list."""
    if len(argv) != 2:
        print("\nUsage: python {} input_file\n".format(argv[0]))
        sys.exit(1)
    extract_images(argv[1])


if __name__ == '__main__':
    main(sys.argv)