此代码应该将任意 PDF 转换为文本。但运行代码时，它什么也没有输出：既没有结果，也没有报错。我不明白原因。
import pytesseract
from pdf2image import convert_from_path
from pytesseract import image_to_string
def convert_pdf_to_img(
    pdf_file,
    poppler_path=r'F:\poppler-21.11.0-h24fffdf_0\Library\bin',
):
    """Render the pages of a PDF file as images via pdf2image.

    Args:
        pdf_file: Path of the PDF file to render.
        poppler_path: Directory holding the Poppler binaries, forwarded to
            ``convert_from_path``. The default is the install location this
            script was written against; pass another directory on machines
            where Poppler lives elsewhere.

    Returns:
        The result of ``convert_from_path`` for ``pdf_file``, which callers
        in this file iterate one page image at a time.
    """
    return convert_from_path(pdf_file, poppler_path=poppler_path)
def convert_image_to_text(file):
    """Run OCR on a single image and return the recognised text.

    ``file`` is handed unchanged to ``pytesseract.image_to_string`` and the
    value it produces is returned as-is.
    """
    return image_to_string(file)
def get_text_from_any_pdf(pdf_file):
    """Extract the text of a PDF by running OCR on each rendered page.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The OCR text of every page concatenated in page order, with no
        separator inserted between pages. An empty string is returned when
        the PDF produces no page images.
    """
    page_images = convert_pdf_to_img(pdf_file)
    # Join once after visiting every page instead of growing a string with
    # ``+=`` inside the loop; the result is only produced after the last
    # page, so no page can be skipped by an early return.
    return "".join(convert_image_to_text(page) for page in page_images)
# PDF to run OCR on; a relative path, so it is resolved against the current
# working directory of the process.
path_to_pdf = 'new.pdf'
# Point pytesseract at the Tesseract executable; this must be set before the
# first OCR call below.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Top-level (unindented) on purpose: this is the statement that produces the
# script's output.
print(get_text_from_any_pdf(path_to_pdf))