diff --git a/PDF_Samples/Seige of Vicksburg Sample OCR.pdf b/PDF_Samples/Seige of Vicksburg Sample OCR.pdf
new file mode 100644
index 0000000..234f39b
Binary files /dev/null and b/PDF_Samples/Seige of Vicksburg Sample OCR.pdf differ
diff --git a/Scripts/pdf-image-extractor.py b/Scripts/pdf-image-extractor.py
index 8052d19..4bf2114 100644
--- a/Scripts/pdf-image-extractor.py
+++ b/Scripts/pdf-image-extractor.py
@@ -17,26 +17,36 @@ pdf = sys.argv[1]
 
 if __name__ == '__main__':
     input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
-    page0 = input1.getPage(0)
-    xObject = page0['/Resources']['/XObject'].getObject()
+    page0 = input1.getPage(2)
 
-    for obj in xObject:
-        if xObject[obj]['/Subtype'] == '/Image':
-            size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
-            data = xObject[obj].getData()
-            if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
-                mode = "RGB"
-            else:
-                mode = "P"
+    if '/XObject' in page0['/Resources']:
+        xObject = page0['/Resources']['/XObject'].getObject()
 
-            if xObject[obj]['/Filter'] == '/FlateDecode':
-                img = Image.frombytes(mode, size, data)
-                img.save(obj[1:] + ".png")
-            elif xObject[obj]['/Filter'] == '/DCTDecode':
-                img = open(obj[1:] + ".jpg", "wb")
-                img.write(data)
-                img.close()
-            elif xObject[obj]['/Filter'] == '/JPXDecode':
-                img = open(obj[1:] + ".jp2", "wb")
-                img.write(data)
-                img.close()
+        print(xObject)
+        for obj in xObject:
+            print(xObject[obj])
+            if xObject[obj]['/Subtype'] == '/Image':
+                size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
+                data = xObject[obj].getData()
+                if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
+                    mode = "RGB"
+                else:
+                    mode = "P"
+
+                if '/Filter' in xObject[obj]:
+                    if xObject[obj]['/Filter'] == '/FlateDecode':
+                        img = Image.frombytes(mode, size, data)
+                        img.save(obj[1:] + ".png")
+                    elif xObject[obj]['/Filter'] == '/DCTDecode':
+                        img = open(obj[1:] + ".jpg", "wb")
+                        img.write(data)
+                        img.close()
+                    elif xObject[obj]['/Filter'] == '/JPXDecode':
+                        img = open(obj[1:] + ".jp2", "wb")
+                        img.write(data)
+                        img.close()
+                else:
+                    img = Image.frombytes(mode, size, data)
+                    img.save(obj[1:] + ".png")
+    else:
+        print("No image found.")