0

这段代码应该能把任意 PDF 转换为文本。但代码运行后什么都没有输出:既没有结果,也没有报错。我不明白原因是什么。

import pytesseract

from pdf2image import convert_from_path

from pytesseract import image_to_string

def convert_pdf_to_img(
    pdf_file,
    poppler_path=r'F:\poppler-21.11.0-h24fffdf_0\Library\bin',
):
    """Render the pages of a PDF file as images via pdf2image.

    Args:
        pdf_file: Path of the PDF to render.
        poppler_path: Directory containing the Poppler binaries, forwarded
            to ``convert_from_path``. Defaults to the install location that
            was previously hard-coded here; pass another directory when
            Poppler lives elsewhere.

    Returns:
        The result of ``convert_from_path`` (per the pdf2image docs, a list
        of PIL images, one per page).
    """
    return convert_from_path(pdf_file, poppler_path=poppler_path)


def convert_image_to_text(file):
    """Run OCR on ``file`` and return the recognised text.

    ``file`` is handed unchanged to ``pytesseract.image_to_string``.
    """
    return image_to_string(file)

def get_text_from_any_pdf(pdf_file):
    """Extract the text of a PDF by running OCR on each rendered page.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The OCR text of all pages concatenated in page order, with no
        separator inserted between pages. An empty string is returned when
        no page images are produced.
    """
    page_images = convert_pdf_to_img(pdf_file)
    # Join once rather than growing a string with += for every page.
    return "".join(convert_image_to_text(image) for image in page_images)

# Tell pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# PDF to convert; a relative path, so it is resolved against the current
# working directory.
path_to_pdf = 'new.pdf'

# Run the conversion only when executed as a script, so that importing this
# module does not trigger an OCR pass.
if __name__ == "__main__":
    print(get_text_from_any_pdf(path_to_pdf))
4

0 回答 0