0

我正在尝试从 PDF 中提取图像,所用的代码取自 StackOverflow。它适用于某些 PDF,但不适用于所有 PDF。我发现一个规律:当 PDF 的页数超过 8-10 页时,它提取不到任何内容。我想我在这里遗漏了一些东西。请帮我弄清楚。下面是我正在使用的代码,以及 PDF 资源的链接。

import PyPDF2
import sys
from PIL import Image
import os
import glob
from PyPDF2 import PdfFileReader
def ExtractImages(filename, start_page=2):
    """Save the images embedded in a PDF's pages to the current directory.

    Every page from ``start_page`` onwards is scanned for image XObjects.
    Each image is written to a file named after its XObject key (without
    the leading ``/``), with an extension chosen from its ``/Filter``:

    * ``/FlateDecode`` or no filter: built with PIL and saved as ``.png``
    * ``/DCTDecode``: stream data written as ``.jpg``
    * ``/JPXDecode``: stream data written as ``.jp2``
    * ``/CCITTFaxDecode``: stream data written as ``.tiff``

    Images with any other filter value are skipped.

    Args:
        filename: Path of the PDF file to read.
        start_page: Zero-based index of the first page to scan. Defaults
            to 2, so the first two pages are skipped unless the caller
            passes 0.

    NOTE: output names come only from the XObject key, so an image that
    reuses a key already seen (on another page or in another PDF)
    overwrites the earlier file.
    """
    # Filters whose stream data is written to disk as-is, paired with the
    # extension to use.  Compared with == rather than used as dict keys
    # because a /Filter value is not guaranteed to be hashable.
    raw_extensions = (
        ('/DCTDecode', ".jpg"),
        ('/JPXDecode', ".jp2"),
        ('/CCITTFaxDecode', ".tiff"),
    )

    print("\n---------------------------------------")
    print("This is the pdf processing",filename)

    # The reader pulls objects from the file lazily, so all page and image
    # access stays inside the ``with``; the handle is closed on exit
    # instead of being leaked.
    with open(filename, "rb") as pdf_file:
        fileObject = PyPDF2.PdfFileReader(pdf_file)
        print(fileObject)
        pages = fileObject.getNumPages()
        print("Total number of Pages is.....",pages)
        for i in range(start_page, pages):
            tempPage = fileObject.getPage(i)
            # A page with no /Resources entry is reported the same way as
            # one with no /XObject entry, rather than raising KeyError.
            if '/Resources' not in tempPage or '/XObject' not in tempPage['/Resources']:
                print("No image found for file.",filename)
                continue

            xObject = tempPage['/Resources']['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue

                data = image.getData()
                image_filter = image['/Filter'] if '/Filter' in image else None

                if image_filter is None or image_filter == '/FlateDecode':
                    # Only this branch needs size and mode, so /ColorSpace
                    # is looked up here and a missing entry cannot break
                    # the raw-copy branches below.
                    size = (image['/Width'], image['/Height'])
                    if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
                        mode = "RGB"
                    else:
                        mode = "P"
                    img = Image.frombytes(mode, size, data)
                    img.save(obj[1:] + ".png")
                    continue

                for filter_name, extension in raw_extensions:
                    if image_filter == filter_name:
                        with open(obj[1:] + extension, "wb") as out_file:
                            out_file.write(data)
                        break

# Run the extraction over every PDF found in the current directory.
listOfFiles = glob.glob('./*.pdf')
for pdf_path in listOfFiles:
    ExtractImages(pdf_path)
4

1 回答 1

0

Ubuntu 16.04 - amd64:这里没有错误。

# Install the libpoppler and libleptonica development packages.
sudo apt install libpoppler-dev libleptonica-dev

# Fetch the pdffigures sources and build them.
git clone https://github.com/allenai/pdffigures.git
cd pdffigures/
make              # The executable 'pdffigures' gets created.
./pdffigures
于 2017-11-22T15:18:27.160 回答