2

我想在 Windows 环境中检查 PDF 文件是否损坏,为此写了下面的 Python 代码。

我想确认一下:这是检测损坏 PDF 文件的最佳方法吗?还是有其他更简单的方法?

注意:C:\Temp\python\sample-map (1).pdf 是一个损坏的 PDF 文件。

这是示例代码

import os
import subprocess
import re
from subprocess import Popen, PIPE

def checkFile(fullfile):
    """Run ``file -b`` on *fullfile* and report the result.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        A ``(exitcode, out, err)`` tuple: the integer exit status of the
        ``file`` command and its raw stdout / stderr as ``bytes``.  If the
        ``file`` executable cannot be started at all, ``exitcode`` is 127,
        ``out`` is empty and ``err`` carries the OS error message, so callers
        that treat a non-zero exit code as a failure keep working.
    """
    # -b, --brief : do not prepend filenames to output lines
    command = ["file", "-b", fullfile]
    try:
        # No shell: with shell=True and a list, POSIX hands only "file" to the
        # shell (the other items become arguments of the shell itself), and on
        # Windows the filename would go through cmd.exe quoting rules.
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as exc:
        # Executable missing or not runnable: report it as a failed run
        # instead of letting the exception escape.
        return 127, b"", str(exc).encode("utf-8", errors="replace")
    out, err = proc.communicate()
    return proc.returncode, out, err

def searchFiles(dirpath):
    """Print an OK / ERROR verdict for every entry directly inside *dirpath*.

    Each readable entry is passed to ``checkFile``.  It is reported as ``OK``
    only when the command exited with status 0, wrote nothing to stderr and
    its output starts with ``PDF`` followed by whitespace; otherwise it is
    reported as ``ERROR``.  Unreadable entries get a "File not readable" line.

    Args:
        dirpath: Directory whose entries are checked (not recursive).

    Returns:
        None.  All results are printed to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        if not os.access(fullfile, os.R_OK):
            # The placeholder used to be "$s", which made the % operator
            # raise TypeError as soon as an unreadable entry was met.
            print("%s : File not readable" % fullfile)
            continue

        code, out, error = checkFile(fullfile)
        # Tool output is not guaranteed to be valid UTF-8 (e.g. messages in
        # the console code page), so never let decoding abort the scan.
        stdout_text = out.decode("utf-8", errors="replace")
        stderr_text = error.decode("utf-8", errors="replace")
        looks_like_pdf = re.match(r"PDF\s", stdout_text) is not None

        if code != 0 or stderr_text or not looks_like_pdf:
            print("ERROR " + fullfile + "\n################")
        else:
            print("OK " + fullfile + "\n################")

if __name__ == "__main__":
    # Raw string: the backslashes are literal Windows path separators.
    # Without the r-prefix, "\T" and "\p" are invalid escape sequences.
    searchFiles(r'C:\Temp\python')

样本输出:

$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK

OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################
4

1 回答 1

2

我认为您可以使用 PyPDF2 模块。

pip install pypdf2

代码如下。

from PyPDF2 import PdfFileReader
import os

def checkFile(fullfile):
    """Tell whether *fullfile* can be read as a PDF by PyPDF2.

    Args:
        fullfile: Path of the file to inspect.

    Returns:
        True when the file opens, ``PdfFileReader`` parses it and
        ``getDocumentInfo()`` returns a non-empty value; False in every other
        case, including paths that cannot be opened at all (missing file,
        directory, permission problem).
    """
    # NOTE(review): a file whose getDocumentInfo() result is empty is reported
    # as False as well -- confirm that such files should count as damaged.
    try:
        # open() is inside the try so that an unopenable directory entry
        # yields False instead of aborting the caller's scan.
        with open(fullfile, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
    except Exception:
        # Deliberately broad: any failure to open or parse means "not a
        # usable PDF".  Unlike a bare except, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    return bool(info)

def searchFiles(dirpath):
    """Print an OK / ERROR line for each entry directly inside *dirpath*.

    The directory of this script is printed first.  If *dirpath* is not
    readable, "Path is not valid" is printed and nothing else happens.
    Otherwise every entry is handed to ``checkFile`` and reported as ``OK``
    when that returns a truthy value, ``ERROR`` otherwise.

    Args:
        dirpath: Directory whose entries are checked (not recursive).

    Returns:
        None.  All results are printed to stdout.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % script_dir)

    # Guard clause: bail out early when the directory cannot be read.
    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for entry in os.listdir(dirpath):
        candidate = os.path.join(dirpath, entry)
        verdict = "OK " if checkFile(candidate) else "ERROR "
        print(verdict + candidate + "\n################")

if __name__ == "__main__":
    # Raw string: the backslashes are literal Windows path separators.
    # Without the r-prefix, "\T" and "\p" are invalid escape sequences.
    searchFiles(r'C:\Temp\python')

我尽量沿用了你的编码风格。

我认为这段代码也可以在 MacOS 或 Linux 上使用。

于 2020-06-01T14:51:00.070 回答