我目前正在对 pdf 文件进行 OCR。这是我的管道:
- 我首先从 pdf 中提取图像(因为我的 pdf 包含扫描文档)并转换为 numpy 数组
- 然后我用 tesseract 对这些图像进行识别(OCR)
它适用于我的大部分图像,但有几个 pdf 中的图像我无法提取出来。下面只举一个例子:我找不到其中包含文字部分(用于 OCR)的那张扫描图像(见下)。这让我抓狂(它到底去哪儿了??)。
也许您可以帮我找回这张图像,并解释为什么我的方法无法提取到这张“幽灵”(fantôme)图像?
注意:我注意到 pdf 中的那些有问题的图像是“jpx”格式。
编辑:由于在 pdf 中找不到该图像,我尝试了一个很糟糕的变通办法(同时等待更聪明的解释 :)):把整个 pdf 页面转换为 Pixmap(PyMuPDF 支持这样做),然后以不同的格式(PNG、TIFF)把 Pixmap 写入磁盘。但与原始 pdf 相比,质量下降太多(因此无法指望 Tesseract 得到合理的识别结果)。
这是 pdf 示例文件(如果您有更简单的托管方式,欢迎告知):https://www.filehosting.org/file/details/906817/IB00058815877D0000000.pdf
这是我从文件中提取的两张图片(第二张应该包含 txt 而不是垃圾)
这是我提取图像的代码:
import fitz
import os
import logging
import cv2
from PIL import Image
from .utils import lazyproperty,showpdf
from .imhelpers import show
from ..config import myconfig
from impocr import logger
import pytesseract
pytesseract.pytesseract.tesseract_cmd = myconfig.TESSERACT_CMD
class InvalidImage(Exception):
    """Error raised when a PDF page holds no image that can be read."""
class PDFParser():
    """Expose the text and the embedded images of one page of a PDF file.

    The document is opened with PyMuPDF (``fitz``).  The images referenced by
    the selected page are available as ``fitz.Pixmap`` objects (``_pixs``) and
    as 3-channel BGR ``numpy`` arrays (``imgs``).  Expensive values are
    computed once through ``lazyproperty``.
    """

    def __init__(self,filepath,page_num=0):
        """Open *filepath* and select the page at index *page_num*.

        Any exception raised while opening the document or selecting the page
        is reported on stdout and then re-raised unchanged.
        """
        self.filepath = filepath
        self.filename = os.path.basename(self.filepath).split('.pdf')[0]
        try:
            self._doc = fitz.open(filepath)
            self.page_num = page_num
            self._page = self._doc[page_num]
        except Exception as e:
            print("Lecture PDF impossible. {}".format(e))
            raise

    @lazyproperty
    def text(self):
        """Text of the selected page as returned by ``page.getText()``."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """List of ``fitz.Pixmap`` objects, one per image listed for the page."""
        # The first item of every image-list entry is the xref used to build
        # the pixmap.
        return [
            fitz.Pixmap(self._doc, entry[0])
            for entry in self._doc.getPageImageList(self.page_num)
        ]

    @lazyproperty
    def _pixpage(self):
        """Grayscale pixmap of the whole selected page.

        NOTE(review): no matrix is passed to ``getPixmap``, so the page is
        presumably rendered at PyMuPDF's default resolution -- confirm whether
        that is sufficient for OCR.
        """
        return self._page.getPixmap(colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First extracted image; raises ``IndexError`` when there is none."""
        return self.imgs[0]

    @property
    def pageimg(self):
        """Whole-page rendering converted to a BGR array."""
        return self.pix2np(self._pixpage)

    @lazyproperty
    def imgs(self):
        """List of BGR arrays, one per pixmap in ``_pixs``."""
        return [self.pix2np(pix) for pix in self._pixs]

    def find_first_valid_image(self):
        """Return the first page image accepted by ``pytesseract.image_to_osd``.

        Images for which Tesseract raises ``TesseractError`` are skipped.

        Raises:
            InvalidImage: if no image passes the check (including the case
                where the page has no image at all).  A warning with the same
                message is logged first.
        """
        for img in self.imgs:
            try:
                # Only the success/failure of the call matters, not its result.
                pytesseract.image_to_osd(img)
            except pytesseract.TesseractError:
                continue
            return img
        message = 'No readable image in page {} of the document {}'.format(self.page_num, self.filename)
        logger.warning(message)
        raise InvalidImage(message)

    def write(self,outputdir,fullpage=False):
        """Write PNG files into *outputdir*, creating the directory if needed.

        With ``fullpage=True`` a single file ``<filename>_p<page>.png`` holding
        the whole-page rendering is written.  Otherwise every embedded image
        is written as ``<filename>_p<page>_i<index>.png``.
        """
        try:
            os.makedirs(outputdir)
        except FileExistsError:
            pass
        else:
            logger.info("Directory {} is created".format(outputdir))

        def _writepix(pix,filepath):
            # GRAY or RGB pixmaps can be written directly.
            try:
                pix.writePNG(filepath)
            # CMYK: convert to RGB first.  ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
            except Exception:
                rgb = fitz.Pixmap(fitz.csRGB, pix)
                rgb.writePNG(filepath)

        if fullpage:
            filepath = os.path.join(outputdir,'{}_p{}.png'.format(self.filename,self.page_num))
            _writepix(self._pixpage,filepath)
            return
        for i,pix in enumerate(self._pixs):
            filepath = os.path.join(outputdir,'{}_p{}_i{}.png'.format(self.filename,self.page_num,i))
            _writepix(pix,filepath)
        return

    def pix2np(self,pix):
        """Convert a pixmap to a contiguous ``(h, w, 3)`` uint8 BGR array.

        https://stackoverflow.com/questions/53059007/python-opencv

        param pix: pixmap exposing ``samples``, ``h``, ``w``, ``n`` and ``alpha``

        Handling per colour layout:
        - four or more colour components (CMYK): converted to RGB first, the
          same way ``write`` does, instead of misreading the channels as RGB;
        - fewer than three channels (gray, optionally with alpha): the gray
          channel is replicated three times;
        - otherwise the first three channels are reordered RGB -> BGR and any
          extra (alpha) channel is dropped.
        """
        import numpy as np
        if pix.n - pix.alpha >= 4:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        #https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n < 3:
            # All three output channels are equal, so RGB/BGR order is moot.
            return np.ascontiguousarray(np.repeat(im[..., :1], 3, axis=2))
        return np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr
if __name__ == "__main__":
    # Manual check: parse the first page of a sample file and display the
    # images extracted from it.
    filepath = r'data\inputs\test\impot_textpdf_with_one_logoimage.pdf'
    ###### Parse page 0 (first page) ######
    pdf = PDFParser(filepath,0)
    text = pdf.text
    imgs = pdf.imgs
    # Show at most the first two images; indexing imgs[0] and imgs[1]
    # directly raised IndexError when fewer than two images were extracted.
    for image in imgs[:2]:
        show(image)
############### other functions ####################
class lazyproperty:
    """Descriptor that computes a value on first access and stores it on the instance."""

    def __init__(self, func):
        # The wrapped method; its __name__ doubles as the cached attribute name.
        self.func = func

    def __get__(self, instance, cls):
        # Accessed on the class itself: hand back the descriptor.
        if instance is None:
            return self
        compute = self.func
        result = compute(instance)
        # Store the result on the instance under the method's own name so
        # that later lookups find the stored value in the instance.
        setattr(instance, compute.__name__, result)
        return result
def show(image):
    """Display *image* in a matplotlib figure with a single axes."""
    from matplotlib import pyplot
    _, axes = pyplot.subplots(1)
    axes.imshow(image)
    pyplot.show()