"""
Extract images from PDF without resampling or altering.

Only the first page of the input file is examined. Each image XObject found
there is written to the current working directory, named after its resource
name (without the leading slash):

* ``/FlateDecode`` images are rebuilt with Pillow and saved as ``.png``.
* ``/DCTDecode`` images have their stream data written to a ``.jpg`` file.
* ``/JPXDecode`` images have their stream data written to a ``.jp2`` file.

Images using any other filter are skipped.

Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
"""

import sys
import PyPDF2
from PIL import Image

# Filters whose stream data is written to disk as-is, mapped to the file
# extension used for the output.
_PASSTHROUGH_EXTENSIONS = {
    '/DCTDecode': ".jpg",
    '/JPXDecode': ".jp2",
}


def _lookup(pdf_dict, key, default=None):
    """Return ``pdf_dict[key]``, or ``default`` when the key is absent.

    Item access is used (as the original code did) instead of ``dict.get()``
    so that values are fetched through the container's own ``__getitem__``.
    """
    if key in pdf_dict:
        return pdf_dict[key]
    return default


def _save_image(name, image):
    """Write one image XObject to disk.

    name  -- resource name of the image, including the leading slash.
    image -- the image XObject (dictionary-like, with a ``getData()`` method).

    Returns the output file name, or None when the image's filter is not one
    of the supported ones and nothing was written.
    """
    size = (image['/Width'], image['/Height'])
    data = image.getData()

    # NOTE(review): every colour space other than /DeviceRGB (including a
    # missing one) falls back to mode "P", as before -- confirm whether
    # grayscale images should map to "L" instead.
    if _lookup(image, '/ColorSpace') == '/DeviceRGB':
        mode = "RGB"
    else:
        mode = "P"

    stem = name[1:]  # drop the leading slash of the PDF name
    image_filter = _lookup(image, '/Filter')

    if image_filter == '/FlateDecode':
        out_name = stem + ".png"
        Image.frombytes(mode, size, data).save(out_name)
        return out_name

    extension = _PASSTHROUGH_EXTENSIONS.get(image_filter)
    if extension is None:
        return None

    out_name = stem + extension
    # Context manager guarantees the handle is closed even if write() fails.
    with open(out_name, "wb") as out_file:
        out_file.write(data)
    return out_name


def main(argv):
    """Run the extractor.

    argv -- full argument vector: program name followed by the input PDF path.

    Returns the process exit status: 1 when the arguments are wrong (after
    printing the usage message), 0 otherwise.
    """
    if len(argv) != 2:
        print("\nUsage: python {} input_file\n".format(argv[0]))
        return 1

    # The reader is used only inside the ``with`` block, so the input file
    # stays open for every read and is closed afterwards.
    with open(argv[1], "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        first_page = reader.getPage(0)

        resources = _lookup(first_page, '/Resources', {})
        xobjects = _lookup(resources, '/XObject')
        if xobjects is None:
            # Nothing to extract: the page declares no XObjects.
            return 0
        xobjects = xobjects.getObject()

        for name in xobjects:
            candidate = xobjects[name]
            if _lookup(candidate, '/Subtype') == '/Image':
                _save_image(name, candidate)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))