我正在尝试从 PDF 中提取图像,所用代码参考自 StackOverflow。
它对某些 PDF 有效,但并非对所有 PDF 都有效。我注意到一个规律:页数超过 8–10 页的 PDF 提取不出任何内容。我想我可能遗漏了什么,请帮我找出问题所在。下面是我正在使用的代码,以及 PDF 资源的链接。
import PyPDF2
import sys
from PIL import Image
import os
import glob
from PyPDF2 import PdfFileReader
def _save_pdf_image(image, base_name):
    """Write one PDF image XObject to disk and report whether it was saved.

    Args:
        image: The image XObject stream (must provide ``/Width``,
            ``/Height`` and ``getData()``).
        base_name: Output path without extension; the extension is chosen
            from the stream's ``/Filter``.

    Returns:
        True if a file was written, False if the filter is not handled.
    """
    size = (image['/Width'], image['/Height'])
    data = image.getData()

    # '/ColorSpace' is optional in an image dictionary, so test for it
    # instead of indexing blindly (which raised KeyError before).
    if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
        mode = "RGB"
    else:
        mode = "P"

    # No filter at all: the stream holds raw samples.
    if '/Filter' not in image:
        Image.frombytes(mode, size, data).save(base_name + ".png")
        return True

    image_filter = image['/Filter']
    # '/Filter' may be written as a one-element array; unwrap it so the
    # comparisons below still match instead of silently skipping the image.
    if isinstance(image_filter, list) and len(image_filter) == 1:
        image_filter = image_filter[0]

    if image_filter == '/FlateDecode':
        Image.frombytes(mode, size, data).save(base_name + ".png")
        return True

    # These filters are dumped as-is under a matching extension.
    # NOTE(review): the CCITTFaxDecode payload is written without a TIFF
    # header, exactly as before - confirm viewers can open it.
    raw_extensions = (
        ('/DCTDecode', ".jpg"),
        ('/JPXDecode', ".jp2"),
        ('/CCITTFaxDecode', ".tiff"),
    )
    for filter_name, extension in raw_extensions:
        if image_filter == filter_name:
            with open(base_name + extension, "wb") as out_file:
                out_file.write(data)
            return True

    # Previously an unknown filter was dropped without any message.
    print("Unsupported image filter", image_filter, "- skipped", base_name)
    return False


def ExtractImages(filename, first_page=2):
    """Extract the image XObjects of a PDF into the current directory.

    Each image is saved as ``<pdf stem>_p<page number>_<xobject name>.<ext>``
    (page number is 1-based) so that images with the same XObject name on
    different pages, or in different PDFs, no longer overwrite each other.

    Args:
        filename: Path of the PDF file to read.
        first_page: Zero-based index of the first page to scan. The default
            of 2 keeps the historical behavior of skipping the first two
            pages; pass 0 to scan the whole document.

    Returns:
        None. Progress and "no image" messages are printed to stdout.
    """
    print("\n---------------------------------------")
    print("This is the pdf processing",filename)

    out_prefix = os.path.splitext(os.path.basename(filename))[0]

    # Keep the file open only while reading: the reader fetches page and
    # stream data from it, and the handle was previously never closed.
    with open(filename, "rb") as pdf_file:
        fileObject = PyPDF2.PdfFileReader(pdf_file)
        print(fileObject)
        pages = fileObject.getNumPages()
        print("Total number of Pages is.....",pages)

        # Clamp so a negative start cannot wrap around to the last pages.
        for i in range(max(first_page, 0), pages):
            page = fileObject.getPage(i)

            # A page may lack '/Resources' or '/XObject' entirely.
            if '/Resources' not in page or '/XObject' not in page['/Resources']:
                print("No image found for file.",filename)
                continue

            xObject = page['/Resources']['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue
                # obj is the XObject name with a leading '/', e.g. '/Im0'.
                base_name = "{}_p{}_{}".format(out_prefix, i + 1, obj[1:])
                _save_pdf_image(image, base_name)
# Batch driver: run the extractor on every PDF in the current directory.
pdf_paths = glob.glob('./*.pdf')
for pdf_path in pdf_paths:
    ExtractImages(pdf_path)