0

我正在尝试逐页提取 PDF 的文本，并把每一页的结果存入一个字典，代码如下：

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re

def convert_pdf_to_txt(path):
    """Extract text from the PDF at `path` into a dict keyed by page counter.

    Keys start at 1 and increase by one per processed page. Each value is
    the content of the single shared output buffer at that moment, with
    runs of spaces collapsed to one space.

    NOTE(review): because the buffer is shared and never reset, the entry
    for page N also contains the text of pages 1..N-1.
    """
    ps=dict()
    rsrcmgr = PDFResourceManager()
    # One output buffer is created here and reused for every page below.
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # NOTE(review): `file` is the Python 2 builtin; fp is never closed here.
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    # Manual 1-based page counter used as the dict key.
    i=1
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
        # getvalue() returns everything written to retstr so far, not just
        # the output of the page that was processed on the line above.
        text = retstr.getvalue()
        # Collapse consecutive spaces before storing.
        ps[i]=re.sub(' +',' ',text)
        i+=1
    return ps

# Print the entry stored under key 3 of the returned dict (Python 2 print statement).
print convert_pdf_to_txt('Aak.pdf')[3]

但是无论我访问哪一页，得到的文本都包含该页之前所有页面的内容，而不只是这一页。请问应该如何解决这个问题？

4

2 回答 2

0

这应该有效。

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

import os

def set_interpreter():
    """Build a fresh text-extraction pipeline and return its parts.

    Returns a dict with three entries:
      'retstr'      -- the StringIO buffer handed to the converter,
      'device'      -- the TextConverter constructed around that buffer,
      'interpreter' -- the PDFPageInterpreter bound to that converter.
    """
    resource_manager = PDFResourceManager()
    out_buffer = StringIO()
    converter = TextConverter(
        resource_manager,
        out_buffer,
        codec='utf-8',
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    return dict(
        retstr=out_buffer,
        device=converter,
        interpreter=page_interpreter,
    )

def convert_pdf_to_txt(path):
    """Extract the text of each page of a PDF separately.

    For every page a fresh buffer/device/interpreter is obtained from
    set_interpreter(), so the extracted text never includes earlier pages.
    Each page's text is written to 'pagetext_<n>.txt' in the current
    working directory, where <n> is the 0-based page index.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A dict mapping the 0-based page index (the same number used in the
        output file name) to that page's extracted text. The dict is empty
        when the PDF yields no pages.
    """
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    page_texts = {}

    # Context managers guarantee the PDF and each output file are closed
    # even if pdfminer raises while processing a page.
    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True)
        for page_number, page in enumerate(pages):
            # New pipeline per page: the buffer then holds this page only.
            si = set_interpreter()
            retstr = si['retstr']
            device = si['device']
            try:
                si['interpreter'].process_page(page)
                text = retstr.getvalue()
            finally:
                # Release the per-page device and buffer right away instead
                # of leaking all but the last one.
                device.close()
                retstr.close()

            with open('pagetext_%d.txt' % page_number, 'w') as fpp:
                fpp.write(text)
            page_texts[page_number] = text

    return page_texts

# Build the PDF path from the directory of realpath('filename.pdf') and print
# whatever convert_pdf_to_txt returns for it (Python 2 print statement).
print convert_pdf_to_txt(os.path.dirname(os.path.realpath('filename.pdf')) + "/filename.pdf")
于 2016-10-19T11:24:56.163 回答
0
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1
from io import StringIO

import numpy as np

def read_pdf(file_path):
    """
    Read a PDF file and return its text page by page.

    The document is walked once, front to back. Every page gets its own
    output buffer and converter, so each entry contains the text of that
    page only.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A dict mapping 1-based page numbers to the extracted text of the
        page. Entries are inserted in ascending page order, so iterating
        the dict yields page 1 first. Empty when the PDF yields no pages.
    """
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    maxpages = 0
    caching = True

    pages_dict = {}
    # The context manager closes the file even if extraction fails midway.
    with open(file_path, 'rb') as fp:
        # An empty pagenos set selects every page; a single pass avoids
        # re-parsing the whole document once per page.
        pages = PDFPage.get_pages(fp, pagenos=set(), maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True)
        for page_number, page in enumerate(pages, start=1):
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec=codec,
                                   laparams=laparams)
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                pages_dict[page_number] = retstr.getvalue()
            finally:
                # Per-page resources are released as soon as the text is read.
                device.close()
                retstr.close()

    return pages_dict

# Example usage: extract every page, then print each page's text under a
# banner line that carries its page number (the dict key).
d = read_pdf("your_document.pdf")
for k, v in d.items():
  print(f"\n----------------------------------------------PAGE {k}----------------------------------------------\n")
  print(v)
于 2021-12-14T20:37:18.267 回答