这可能只是由于PyPdf2's
提取文本功能,但是当我运行下面的代码以重命名文件时,会出现很多最常见的词,例如“Nthe”、“Nfrom”和“Ncommunications”。我不确定我能做些什么来阻止这种情况的发生,或者如何解决它。
是什么导致了这样的问题?
N从哪里来?
其他 PDF 完全符合我的要求,所以我不知道从哪里开始。
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
pdfFileObj = open('{0}'.format(file), 'rb') #Open the File
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) #Read the file
frequency = {} #Create dict
ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'} #Ignore dese ones
print "Number of Pages %s " % pdfReader.numPages #Print Num Pages
word_list = []
for i in range(pdfReader.numPages):
pageObj = pdfReader.getPage(i) # Get the first page
word_list.append(pageObj.extractText()) #Add the pages together
match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list)) #Find the text
cnt = Counter()
for words in match_pattern: #Start counting the frequency
words.lower() # Lower Case Words
if words not in ignore: #Ignore common words
count = frequency.get(words,0) #Start the count?
frequency[words] = count + 1 #Add one
fl = sorted(frequency, key=frequency.__getitem__, reverse = True)[:3] #Sort according to frequency
pdfFileObj.close() #Close the PDF
newtitle = ' '.join(map(str,fl, )).title() #Join the title list together
try:
print newtitle #Print the title
os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))#Rename the file
except:
print "Unable to Rename File"