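Tika 是可用于从 PDF 文件中提取数据的 Python 包之一。
在下面的示例中,我使用 Tika 和正则表达式来提取这五个数据元素:投标编号(Bid Number)、投标截止日期(Bid End Date)、机构名称(Organisation Name)、物品类别(Item Category)和总数量(Total Quantity)。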
import re as regex

from tika import parser

# Printed in place of a field whose pattern is not found in the document.
MISSING = 'N/A'

# One pattern per data element, in the order they are printed.
# Group 1 captures the field label and group 2 captures the field value.
FIELD_PATTERNS = {
    'bid_number': regex.compile(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)'),
    'bid_end_date': regex.compile(
        r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})'
    ),
    'org_name': regex.compile(r'(Organisation Name)\W(.*)'),
    'item_category': regex.compile(r'(Item Category)\W(.*)'),
    'total_quantity': regex.compile(r'(Total Quantity)\W(\d+)'),
}

parse_entire_pdf = parser.from_file('2022251527199.pdf', xmlContent=True)

# Read the extracted text directly instead of looping over every key.
# Fall back to an empty string when 'content' is absent or None so the
# searches below cannot raise TypeError.
content = parse_entire_pdf.get('content') or ''

for pattern in FIELD_PATTERNS.values():
    match = pattern.search(content)
    # search() returns None when the field is missing; calling .group() on
    # None would raise AttributeError, so print a placeholder instead.
    print(match.group(2) if match else MISSING)

# Output reported for the sample document:
# GEM/2022/B/1916455
# 21-02-2022 15:00:00
# State Election Commission (sec), Gujarat
# Desktop Computers (Q2) , Computer Printers (Q2)
# 18
这是将提取的数据写入 CSV 文件的一种方法:
import csv
import re as regex

from tika import parser

# Written to the CSV in place of a field whose pattern is not found.
MISSING = 'N/A'

# One pattern per CSV column, keyed by the column header.
# Group 1 captures the field label and group 2 captures the field value.
# Compiled once here rather than on every loop iteration.
FIELD_PATTERNS = {
    'bid_number': regex.compile(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)'),
    'bid_end_date': regex.compile(
        r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})'
    ),
    'org_name': regex.compile(r'(Organisation Name)\W(.*)'),
    'item_category': regex.compile(r'(Item Category)\W(.*)'),
    'total_quantity': regex.compile(r'(Total Quantity)\W(\d+)'),
}


def _extract_row(content):
    """Return one CSV row of field values found in *content*.

    Fields whose pattern does not match are filled with ``MISSING`` instead
    of raising ``AttributeError`` on a ``None`` match.
    """
    row = []
    for pattern in FIELD_PATTERNS.values():
        match = pattern.search(content)
        row.append(match.group(2) if match else MISSING)
    return row


document_elements = []

# processing 2 documents
documents = ['202225114747453.pdf', '2022251527199.pdf']
for doc in documents:
    parse_entire_pdf = parser.from_file(doc, xmlContent=True)
    # Fall back to an empty string when 'content' is absent or None so that
    # every document still produces exactly one row.
    content = parse_entire_pdf.get('content') or ''
    document_elements.append(_extract_row(content))

# newline="" lets the csv module control line endings itself.
with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    # Header names are the pattern keys, in the same order as the row values.
    writer.writerow(list(FIELD_PATTERNS))
    writer.writerows(document_elements)
这是您在评论中要求的附加代码,它会遍历指定目录(包括子目录)中的所有 PDF 文件,并将提取的数据写入 CSV 文件:
import csv
import os
import re as regex

from tika import parser

# Written to the CSV in place of a field whose pattern is not found.
MISSING = 'N/A'

# One pattern per CSV column, keyed by the column header.
# Group 1 captures the field label and group 2 captures the field value.
# Compiled once here rather than for every file.
FIELD_PATTERNS = {
    'bid_number': regex.compile(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)'),
    'bid_end_date': regex.compile(
        r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})'
    ),
    'org_name': regex.compile(r'(Organisation Name)\W(.*)'),
    'item_category': regex.compile(r'(Item Category)\W(.*)'),
    'total_quantity': regex.compile(r'(Total Quantity)\W(\d+)'),
}


def _extract_row(content):
    """Return one CSV row of field values found in *content*.

    Fields whose pattern does not match are filled with ``MISSING`` instead
    of raising ``AttributeError`` on a ``None`` match.
    """
    row = []
    for pattern in FIELD_PATTERNS.values():
        match = pattern.search(content)
        row.append(match.group(2) if match else MISSING)
    return row


document_elements = []

# Directory that is searched recursively for PDF files.
pdf_directory = "pdf_files"
pdf_directory_abspath = os.path.abspath(pdf_directory)

for dirpath, dirnames, filenames in os.walk(pdf_directory_abspath):
    for filename in filenames:
        if not filename.endswith(".pdf"):
            continue
        parse_entire_pdf = parser.from_file(
            os.path.join(dirpath, filename), xmlContent=True
        )
        # Fall back to an empty string when 'content' is absent or None so
        # that every PDF still produces exactly one row.
        content = parse_entire_pdf.get('content') or ''
        document_elements.append(_extract_row(content))

# newline="" lets the csv module control line endings itself.
with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    # Header names are the pattern keys, in the same order as the row values.
    writer.writerow(list(FIELD_PATTERNS))
    writer.writerows(document_elements)
特别说明:我注意到有些 PDF 没有 org_name,因此您需要自行决定在缺少该字段时如何处理,例如用 N/A、None 或 Null 来填充。