我正在尝试学习如何每 n 页拆分一个 pdf。
在我的情况下,我想将 64p PDF 拆分为几个块,每个块包含四页:文件 1:p.1-4,文件 2:p.5-8 等。
我试图理解 PyPDF2,但我的菜鸟让我不知所措:
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf = PdfFileReader('my_pdf.pdf')
我想我需要使用和写入文件进行各种循环,addPage
直到没有页面为止?
有点晚了,但我在寻求帮助尝试做同样的事情时遇到了你的问题。我最终做了以下事情,这可以满足您的要求。请注意,这可能比您要求的要多,但答案就在那里。这是一个粗略的初稿,非常需要重构和一些变量重命名。
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def split_pdf(in_pdf, step=1):
"""Splits a given pdf into seperate pdfs and saves
those to a supfolder of the parent pdf's folder, called
splitted_pdf.
Arguments:
in_pdf: [str] Absolute path (and filename) of the
input pdf or just the filename, if the file
is in the current directory.
step: [int] Desired number of pages in each of the
output pdfs.
Returns:
dunno yet
"""
#TODO: Add choice for output dir
#TODO: Add logging instead of prints
#TODO: Refactor
try:
with open(in_pdf, 'rb') as in_file:
input_pdf = PdfFileReader(in_file)
num_pages = input_pdf.numPages
input_dir, filename = os.path.split(in_pdf)
filename = os.path.splitext(filename)[0]
output_dir = input_dir + "/" + filename + "_splitted/"
os.mkdir(output_dir)
intervals = range(0, num_pages, step)
intervals = dict(enumerate(intervals, 1))
naming = f'{filename}_p'
count = 0
for key, val in intervals.items():
output_pdf = PdfFileWriter()
if key == len(intervals):
for i in range(val, num_pages):
output_pdf.addPage(input_pdf.getPage(i))
nums = f'{val + 1}' if step == 1 else f'{val + 1}-{val + step}'
with open(f'{output_dir}{naming}{nums}.pdf', 'wb') as outfile:
output_pdf.write(outfile)
print(f'{naming}{nums}.pdf written to {output_dir}')
count += 1
else:
for i in range(val, intervals[key + 1]):
output_pdf.addPage(input_pdf.getPage(i))
nums = f'{val + 1}' if step == 1 else f'{val + 1}-{val + step}'
with open(f'{output_dir}{naming}{nums}.pdf', 'wb') as outfile:
output_pdf.write(outfile)
print(f'{naming}{nums}.pdf written to {output_dir}')
count += 1
except FileNotFoundError as err:
print('Cannot find the specified file. Check your input:')
print(f'{count} pdf files written to {output_dir}')
希望它可以帮助你。
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
# Method to split the pdf at every given n pages.
def split_at_every(self,infile , step = 1):
# Copy the input file path to a local variable infile
input_pdf = PdfFileReader(open(infile, "rb"))
pdf_len = input_pdf.number_of_pages
# Get the complete file name along with its path and split the text to take only the first part.
fname = os.path.splitext(os.path.basename(infile))[0]
# Get the list of page numbers in the order of given step
# If there are 10 pages in a pdf, and the step is 2
# page_numbers = [0,2,4,6,8]
page_numbers = list(range(0,pdf_len,step))
# Loop through the pdf pages
for ind,val in enumerate(page_numbers):
# Check if the index is last in the given page numbers
# If the index is not the last one, carry on with the If block.
if(ind+1 != len(page_numbers)):
# Initialize the PDF Writer
output_1 = PdfFileWriter()
# Loop through the pdf pages starting from the value of current index till the value of next index
# Ex : page numbers = [0,2,4,6,8]
# If the current index is 0, loop from 1st page till the 2nd page in the pdf doc.
for page in range(page_numbers[ind], page_numbers[ind+1]):
# Get the data from the given page number
page_data = input_pdf.getPage(page)
# Add the page data to the pdf_writer
output_1.addPage(page_data)
# Frame the output file name
output_1_filename = '{}_page_{}.pdf'.format(fname, page + 1)
# Write the output content to the file and save it.
self.write_to_file(output_1_filename, output_1)
else:
output_final = PdfFileWriter()
output_final_filename = "Last_Pages"
# Loop through the pdf pages starting from the value of current index till the last page of the pdf doc.
# Ex : page numbers = [0,2,4,6,8]
# If the current index is 8, loop from 8th page till the last page in the pdf doc.
for page in range(page_numbers[ind], pdf_len):
# Get the data from the given page number
page_data = input_pdf.getPage(page)
# Add the page data to the pdf_writer
output_final.addPage(page_data)
# Frame the output file name
output_final_filename = '{}_page_{}.pdf'.format(fname, page + 1)
# Write the output content to the file and save it.
self.write_to_file(output_final_filename,output_final)