Merge pull request #1 from maphew/master

Image extractor script with sample failing pdf
2016-01-07 08:29:57 +01:00 · 2016-01-07 08:29:57 +01:00 · c83cbd87e7
parent 39de327cd9 eeb2b659aa
commit c83cbd87e7
2 changed files with 42 additions and 0 deletions
--- a/PDF_Samples/GeoBase_NHNC1_Data_Model_UML_EN.pdf
+++ b/PDF_Samples/GeoBase_NHNC1_Data_Model_UML_EN.pdf
--- a/Scripts/pdf-image-extractor.py
+++ b/Scripts/pdf-image-extractor.py
@ -0,0 +1,42 @@
+'''
+Extract images from PDF without resampling or altering.
+
+Adapted from work by Sylvain Pelissier
+http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
+'''
+
+import sys
+import PyPDF2
+from PIL import Image
+
+if __name__ == '__main__':
+    # Validate the command line only when run as a script, not on import.
+    if len(sys.argv) != 2:
+        print("\nUsage: python {} input_file\n".format(sys.argv[0]))
+        sys.exit(1)
+    pdf = sys.argv[1]
+
+    # Close the PDF file handle once extraction is done (also on error).
+    with open(pdf, "rb") as pdf_file:
+        page0 = PyPDF2.PdfFileReader(pdf_file).getPage(0)
+        resources = page0['/Resources']
+        # A first page without an /XObject entry has nothing to extract.
+        xObject = resources['/XObject'].getObject() if '/XObject' in resources else {}
+
+        for obj in xObject:
+            image = xObject[obj]
+            if image['/Subtype'] != '/Image':
+                continue
+            data = image.getData()
+            out_name = obj[1:]  # drop the first character of the XObject key
+            if image['/Filter'] == '/FlateDecode':
+                # Pixel data is handed to PIL, which needs the mode and size.
+                mode = "RGB" if image['/ColorSpace'] == '/DeviceRGB' else "P"
+                size = (image['/Width'], image['/Height'])
+                Image.frombytes(mode, size, data).save(out_name + ".png")
+            elif image['/Filter'] == '/DCTDecode':
+                with open(out_name + ".jpg", "wb") as img:  # written as-is
+                    img.write(data)
+            elif image['/Filter'] == '/JPXDecode':
+                with open(out_name + ".jp2", "wb") as img:  # written as-is
+                    img.write(data)