python-3.x - 为什么我不能从这个 pdf 中正确提取图像？[需要帮助]

Question

我目前正在对 pdf 文件进行 OCR。这是我的管道：

我首先从 pdf 中提取图像（因为我的 pdf 包含扫描文档）并转换为 numpy 数组
然后我用 tesseract 阅读

它适用于我的大部分 pdf，但有几个 pdf 我无法提取其中的图像。这里只举一个例子：我找不到（见下文）包含文字部分（用于 OCR）的扫描图像。这让我抓狂（它去哪里了？？）。

也许您可以帮我取回该图像，并解释为什么我的方法无法取回这张“幽灵（fantôme）”图像？

注意：我注意到 pdf 中的那些有问题的图像是“jpx”格式。

编辑：由于在 pdf 中找不到该图像，我尝试了一个糟糕的变通办法（等待更聪明的解释 :)）：把整个 pdf 页面转换为 pixmap（PyMuPDF 支持这样做），然后以不同的格式（PNG、TIFF）将 pixmap 写入磁盘。与原始 pdf 相比，质量下降太多（因此无法指望用 Tesseract 得到合理的识别结果）。

这是 pdf 示例文件（如果您有更简单的托管方式，我很想知道）：https://www.filehosting.org/file/details/906817/IB00058815877D0000000.pdf

这是我从文件中提取的两张图片（第二张应该包含文字而不是乱码）

这是我提取图像的代码：

import fitz
import os
import logging
import cv2
from PIL import Image
from .utils import lazyproperty,showpdf
from .imhelpers import show
from ..config import myconfig
from impocr import logger
import pytesseract

# Tell pytesseract which Tesseract command to run; the value comes from the
# project configuration (myconfig.TESSERACT_CMD).
pytesseract.pytesseract.tesseract_cmd = myconfig.TESSERACT_CMD

class InvalidImage(Exception):
    """Error signalling that no image usable for OCR could be obtained."""


class PDFParser():
    """
    Give access to the content of one page of a PDF opened with PyMuPDF.

    The text of the page, the images embedded in the page and a rendering of
    the whole page are exposed as attributes. Images are returned as BGR
    numpy arrays (see pix2np) and can be written to disk as PNG (see write).
    """
    def __init__(self,filepath,page_num=0,zoom=1.0):
        """
        Open the document and select the page to work on.

        param filepath: path of the PDF file
        param page_num: index of the page to parse (0 is the first page)
        param zoom: scale factor used when the whole page is rendered
            (_pixpage, pageimg, write(fullpage=True)). 1.0 keeps the default
            resolution; larger values render more pixels, which helps OCR.
        Any error raised while opening the file or selecting the page is
        printed and re-raised.
        """
        self.filepath = filepath
        self.filename = os.path.basename(self.filepath).split('.pdf')[0]
        self.zoom = zoom
        try:
            self._doc = fitz.open(filepath)
            self.page_num = page_num
            self._page = self._doc[page_num]
        except Exception as e:
            print("Lecture PDF impossible. {}".format(e))
            raise

    @lazyproperty
    def text(self):
        """Text of the page as returned by the page's getText()."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """List of pixmaps, one per entry of the page image list."""
        # The first item of each image-list entry is the xref of the image.
        return [
            fitz.Pixmap(self._doc, img[0])
            for img in self._doc.getPageImageList(self.page_num)
        ]

    @lazyproperty
    def _pixpage(self):
        """Gray-scale pixmap of the whole page, scaled by self.zoom."""
        # A (zoom, zoom) matrix scales both axes; with zoom == 1.0 the page is
        # rendered at the default resolution.
        matrix = fitz.Matrix(self.zoom, self.zoom)
        return self._page.getPixmap(matrix=matrix, colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First image of the page (IndexError if the page has no image)."""
        return self.imgs[0]

    @property
    def pageimg(self):
        """Rendering of the whole page as a BGR numpy array."""
        return self.pix2np(self._pixpage)

    @lazyproperty
    def imgs(self):
        """Images of the page as a list of BGR numpy arrays."""
        return [self.pix2np(pix) for pix in self._pixs]

    def find_first_valid_image(self):
        """
        Return the first image of the page accepted by Tesseract.

        An image is considered valid when pytesseract.image_to_osd does not
        raise pytesseract.TesseractError on it.

        return: the first valid image (numpy array)
        raises InvalidImage: when no image of the page is valid
        """
        for img in self.imgs:
            try:
                pytesseract.image_to_osd(img)
            except pytesseract.TesseractError:
                continue
            return img
        message = 'No readable image in page {} of the document {}'.format(self.page_num, self.filename)
        logger.warning(message)
        raise InvalidImage(message)

    def write(self,outputdir,fullpage=False):
        """
        Write the images of the page, or the whole page, as PNG files.

        param outputdir: destination directory, created when missing
        param fullpage: when True write one rendering of the whole page,
            otherwise write every image embedded in the page
        return: the path of the written file when fullpage is True,
            otherwise the list of the written paths
        """
        try:
            os.makedirs(outputdir)
            logger.info("Directory {} is created".format(outputdir))
        except FileExistsError:
            pass

        def _writepix(pix,filepath):
            # PNG can only hold gray or RGB data: a pixmap with more than 3
            # colour components (CMYK) is converted to RGB first.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(filepath)

        if fullpage:
            filepath = os.path.join(outputdir,'{}_p{}.png'.format(self.filename,self.page_num))
            _writepix(self._pixpage,filepath)
            return filepath
        filepaths = []
        for i,pix in enumerate(self._pixs):
            filepath = os.path.join(outputdir,'{}_p{}_i{}.png'.format(self.filename,self.page_num,i))
            _writepix(pix,filepath)
            filepaths.append(filepath)
        return filepaths

    def pix2np(self,pix):
        """
        Convert pixmap to image np.ndarray
        https://stackoverflow.com/questions/53059007/python-opencv
        param pix: pixmap
        return: contiguous uint8 array of shape (height, width, 3), BGR order
        """
        import numpy as np
        # More than 3 colour components means CMYK: picking three of its
        # channels would give wrong colours, so convert to RGB first.
        if pix.n - pix.alpha > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        #https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.alpha:
            # The alpha channel is the last component of each pixel: drop it.
            im = im[..., :-1]
        if im.shape[2] == 1:
            # Gray: the three BGR channels are copies of the single channel.
            im = np.repeat(im, 3, axis=2)
        else:
            im = im[..., ::-1]  # rgb to bgr
        return np.ascontiguousarray(im)



        
if __name__ == "__main__":
    # Manual check: parse page 0 (the first page) of a sample document,
    # read its text and its images, then display the first two images.
    filepath = r'data\inputs\test\impot_textpdf_with_one_logoimage.pdf'
    pdf = PDFParser(filepath, page_num=0)
    text = pdf.text
    imgs = pdf.imgs
    for position in (0, 1):
        show(pdf.imgs[position])

############### other functions ####################
class lazyproperty:
    """
    Descriptor computing a value on first access and storing it on the instance.

    The result of the wrapped function is set on the instance under the
    function's own name. Accessed on the class, the descriptor itself is
    returned.
    """
    def __init__(self, func):
        self.func = func

    def __get__(self, instance, cls):
        # Access through the class, not through an instance.
        if instance is None:
            return self
        computed = self.func(instance)
        setattr(instance, self.func.__name__, computed)
        return computed

def show(image):
    """Display image in a new matplotlib figure."""
    from matplotlib import pyplot
    _figure, axes = pyplot.subplots(1)
    axes.imshow(image)
    pyplot.show()

score 1 · Accepted Answer

我的解决方案不是很好（仍在等待更好的想法），但这是我的一点拙见：我先把整页写入磁盘，再用 opencv 读取（我修改了 find_first_valid_image 方法，如下所示）。

from tempfile import TemporaryDirectory
def find_first_valid_image(self):
    """
    Return the first image of the page accepted by Tesseract.

    Every image of self.imgs is tried with pytesseract.image_to_osd. When
    all of them raise pytesseract.TesseractError, the whole page is written
    as PNG in a temporary directory and read back with cv2.imread.

    return: the first valid image, or the whole page read with OpenCV
    raises InvalidImage: when the written page cannot be read back
    """
    for img in self.imgs:
        try:
            pytesseract.image_to_osd(img)
        except pytesseract.TesseractError:
            continue
        return img
    logger.warning('No readable image in page {} of the document {}. Tried the fullpage.'.format(self.page_num, self.filename))
    with TemporaryDirectory() as tmpdirname:
        self.write(tmpdirname,fullpage=True)
        # The return value of write() is not relied on: rebuild the name it
        # gives to a full-page PNG.
        filepath = os.path.join(tmpdirname,'{}_p{}.png'.format(self.filename,self.page_num))
        # Read while the temporary directory still exists.
        img_fullpage = cv2.imread(filepath)
    # cv2.imread returns None instead of raising when the file is unreadable.
    if img_fullpage is None:
        raise InvalidImage('No readable image in page {} of the document {}'.format(self.page_num, self.filename))
    return img_fullpage

我认为它会降低我原始图像的质量；因此，当在图像上应用 tesseract 时，我得到了一个错误的 ocr，如您所见。

"""DIRECTION GÉNÉRALE DE6 FNANCES PUBLIQUES\n\nAVIS D'IMPÔT 2017\nIMPÔT SUR LES REVENUS\nd Fannée 2016\n\n \n\nPour vos _ démarches,\npas  besoin doiginal —\nMc d furir un —\nphotocopie, vérifiable sur —\nTmpots gouv vn\n\nVotre situation\n\n \n\nVos rétérences.\n\nPour accéder à votre espace partculior MONTANT À PAYER\nNuméro fiscal | | A us ario 15/00/2017 (41)\n\nN* daccès en ligne voirvouo déciaration | | Détail du montant à payer\nRevenu fiscal d référence Montart de vtr impôt su e revors\n# | Rétéronce de 'avis <VRRRRS | Versemens sur 1er acompte\nVersomontssur 26 acompto\n\nNuméro F —\n\nNuméro de rôle 016 A\nDate c'étaissement 2m0762017|\nDate de mise en recouvrement 3vo7æ2017|\n\n \n\n \n\n \n\n3899,00 €\n3893006\n\n \n\n \n\nLa somme que vous davez payer est supérieure à 2 000 €\nLa loirend obligatoie le paiement de cette somme par un des moyens suivants, à votre choix :\n\nur impots.gouv.fr: payez en igne ou adhérez au prélèvement à léchéance en vous connectant à vore\nspaco pariclor, pislissoz-vous guider\n\npartéléphone, courrir où couriel pour adhérer au prélèvement à échéanco (aux coordonnéesindiquées\ndansle cadre - Vos démarches »\n\nPour 2018,vous pourrez achérerau prélèvement mensue\n\x0c"""

python-3.x - 为什么我不能从这个 pdf 中正确提取图像？[需要帮助]

1 回答 1

Related

Reference