在双换行的情况下,我也遇到了同样的问题。在您的情况下,它正在切换字符。我花了一些时间查看代码,并进行了一些更改并修复了问题。您可以使用以下代码。添加以下代码后,不要使用 camelot.read_pdf,而是使用我制作的自定义方法 read_pdf_custom() 为了更好的体验,我建议您使用 camelot v==0.8.2
import sys
import warnings
from camelot import read_pdf
from camelot import handlers
from camelot.core import TableList
from camelot.parsers import Lattice
from camelot.parsers.base import BaseParser
from camelot.core import Table
import camelot
from camelot.utils import validate_input, remove_extra,TemporaryDirectory,get_page_layout,get_text_objects,get_rotation,is_url,download_url,scale_image,scale_pdf,segments_in_bbox,text_in_bbox,merge_close_lines,get_table_index,compute_accuracy,compute_whitespace
from camelot.image_processing import (
adaptive_threshold,
find_lines,
find_contours,
find_joints,
)
class custom_lattice(Lattice):
def _generate_columns_and_rows(self, table_idx, tk):
# select elements which lie within table_bbox
t_bbox = {}
v_s, h_s = segments_in_bbox(
tk, self.vertical_segments, self.horizontal_segments
)
custom_horizontal_indexes=[]
custom_vertical_indexes=[]
for zzz in self.horizontal_text:
try:
h_extracted_text=self.find_between(str(zzz),"'","'").strip()
h_text_index=self.find_between(str(zzz),"LTTextLineHorizontal","'").strip().split(",")
custom_horizontal_indexes.append(h_text_index[1])
except:
pass
inserted=0
for xxx in self.vertical_text:
v_extracted_text=self.find_between(str(xxx),"'","'").strip()
v_text_index=self.find_between(str(xxx),"LTTextLineVertical","'").strip().split(",")
custom_vertical_indexes.append(v_text_index[1])
vertical_second_index=v_text_index[1]
try:
horizontal_index=custom_horizontal_indexes.index(vertical_second_index)
self.horizontal_text.insert(horizontal_index,xxx)
except Exception as exxx:
pass
self.vertical_text=[]
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
self.t_bbox = t_bbox
cols, rows = zip(*self.table_bbox[tk])
cols, rows = list(cols), list(rows)
cols.extend([tk[0], tk[2]])
rows.extend([tk[1], tk[3]])
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
return cols, rows, v_s, h_s
def _generate_table(self, table_idx, cols, rows, **kwargs):
print("\n")
v_s = kwargs.get("v_s")
h_s = kwargs.get("h_s")
if v_s is None or h_s is None:
raise ValueError("No segments found on {}".format(self.rootname))
table = Table(cols, rows)
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
table = table.set_border()
table = table.set_span()
pos_errors = []
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
)
for r_idx, c_idx, text in indices:
temp_text=text.strip().replace("\n","")
if len(temp_text)==1:
text=temp_text
table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None:
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))
# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = (self.image, self.table_bbox_unscaled)
table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None
return table
class PDFHandler(handlers.PDFHandler):
def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
):
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
pages = [
os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
]
parser = custom_lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
for p in pages:
t = parser.extract_tables(
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
tables.extend(t)
return TableList(sorted(tables))
def read_pdf_custom(
filepath,
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
layout_kwargs={},
**kwargs
):
if flavor not in ["lattice", "stream"]:
raise NotImplementedError(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)
with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")
validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs,
**kwargs
)
return tables