1

我正在尝试使用 Camelot 解析 PDF 中的表格。单元格中有多行文本,有些单元格有一个空行分隔部分文本:

First line
Second line

Third line

我希望它被解析为 `First line\nSecond line\n\nThird line`(注意其中的双换行符),但实际得到的却是 `T\nFirst line\nSecond line\nhird line`。也就是说,双换行符之后的第一个字符被移到了文本的开头,而且我只得到了一个换行符。

我也尝试过使用 Tabula,但是当表格中存在空行时,它会把整个表格(实际上是 DataFrame)弄乱,并且对于某些单词,它会在字符之间插入空格。

编辑:

我的主要问题是多个连续的换行符被丢掉了。如果我能知道空行所在的位置,另一个问题(首字符错位)我可以自己用代码修复。

4

2 回答 2

0

我的朋友,你能检查一下这里的例子吗

https://camelot-py.readthedocs.io/en/master/user/advanced.html#improve-guessed-table-rows

tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10)

tables[0].df

我用下面的代码解决了同样的问题

tables = camelot.read_pdf(file, flavor='stream', table_areas=['24,618,579,93'], columns=['67,315,369,483,571'], row_tol=10, strip_text='t\r\n\v')

于 2020-10-18T20:10:11.117 回答
0

在出现双换行的情况下,我也遇到了同样的问题;在您的例子中,它表现为字符的位置被调换。我花了一些时间阅读 Camelot 的源码,做了一些修改后解决了这个问题。您可以使用下面的代码:加入这些代码之后,不要再调用 camelot.read_pdf,而是改用我编写的自定义方法 read_pdf_custom()。为了获得更好的效果,我建议您使用 camelot 0.8.2 版本。

import os
import sys
import warnings

import pandas as pd

import camelot
from camelot import read_pdf
from camelot import handlers
from camelot.core import TableList
from camelot.core import Table
from camelot.parsers import Lattice
from camelot.parsers import Stream
from camelot.parsers.base import BaseParser
from camelot.utils import validate_input, remove_extra,TemporaryDirectory,get_page_layout,get_text_objects,get_rotation,is_url,download_url,scale_image,scale_pdf,segments_in_bbox,text_in_bbox,merge_close_lines,get_table_index,compute_accuracy,compute_whitespace
from camelot.image_processing import (
    adaptive_threshold,
    find_lines,
    find_contours,
    find_joints,
)

class custom_lattice(Lattice):
    """Lattice parser that folds vertical text lines into the horizontal ones.

    Two ``Lattice`` hooks are overridden:

    * ``_generate_columns_and_rows`` moves every vertical text line whose
      ``y0`` matches that of a horizontal text line into
      ``self.horizontal_text`` and then empties ``self.vertical_text``.
    * ``_generate_table`` stores a text fragment without surrounding
      whitespace/newlines when the fragment is a single character.
    """

    @staticmethod
    def _baseline_key(text_line):
        """Return the ``y0`` of *text_line* as a three-decimal string.

        The string is used as an equality key to pair vertical text lines
        with horizontal ones.
        """
        # NOTE(review): three decimals is assumed to match the precision
        # pdfminer uses when it renders a text line's bbox -- confirm if the
        # matching needs to be looser or stricter.
        return "{:.3f}".format(text_line.y0)

    def _generate_columns_and_rows(self, table_idx, tk):
        """Collect the text and the column/row boundaries for one table.

        Args:
            table_idx: Index of the table being processed (unused here).
            tk: Bounding box of the table; used as the key into
                ``self.table_bbox`` and indexed as ``tk[0]..tk[3]``.

        Returns:
            tuple: ``(cols, rows, v_s, h_s)`` where ``cols`` and ``rows`` are
            lists of ``(start, end)`` pairs built from consecutive merged
            boundary values, and ``v_s`` / ``h_s`` are the vertical and
            horizontal segments returned by ``segments_in_bbox``.

        Side effects:
            Inserts matching vertical text lines into ``self.horizontal_text``,
            resets ``self.vertical_text`` to an empty list and sets
            ``self.t_bbox``.
        """
        # select segments which lie within table_bbox
        v_s, h_s = segments_in_bbox(
            tk, self.vertical_segments, self.horizontal_segments
        )

        # One key per horizontal line, kept index-aligned with
        # self.horizontal_text so that positions stay valid after inserts.
        horizontal_keys = [
            self._baseline_key(line) for line in self.horizontal_text
        ]
        for line in self.vertical_text:
            key = self._baseline_key(line)
            try:
                position = horizontal_keys.index(key)
            except ValueError:
                # No horizontal line shares this y0; the vertical line is
                # discarded when vertical_text is cleared below.
                continue
            self.horizontal_text.insert(position, line)
            horizontal_keys.insert(position, key)
        self.vertical_text = []

        # select text which lies within table_bbox
        t_bbox = {
            "horizontal": text_in_bbox(tk, self.horizontal_text),
            "vertical": text_in_bbox(tk, self.vertical_text),
        }
        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
        self.t_bbox = t_bbox

        cols, rows = zip(*self.table_bbox[tk])
        cols, rows = list(cols), list(rows)
        # The table's own edges always count as boundaries.
        cols.extend([tk[0], tk[2]])
        rows.extend([tk[1], tk[3]])
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
        rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
        cols = [(cols[i], cols[i + 1]) for i in range(len(cols) - 1)]
        rows = [(rows[i], rows[i + 1]) for i in range(len(rows) - 1)]

        return cols, rows, v_s, h_s

    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Build a ``Table`` and fill its cells from ``self.t_bbox``.

        Args:
            table_idx: Index of the table; stored as ``table.order`` plus one.
            cols: Column boundaries passed to ``Table``.
            rows: Row boundaries passed to ``Table``.
            **kwargs: Must contain ``v_s`` and ``h_s`` (vertical and
                horizontal segments).

        Returns:
            Table: The populated table with ``df``, ``shape``, ``accuracy``,
            ``whitespace``, ``order``, ``page`` and the plotting attributes
            set.

        Raises:
            ValueError: If ``v_s`` or ``h_s`` is missing or ``None``.
        """
        v_s = kwargs.get("v_s")
        h_s = kwargs.get("h_s")
        if v_s is None or h_s is None:
            raise ValueError("No segments found on {}".format(self.rootname))

        table = Table(cols, rows)
        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
        table = table.set_border()
        table = table.set_span()

        pos_errors = []
        for direction in ["vertical", "horizontal"]:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table,
                    t,
                    direction,
                    split_text=self.split_text,
                    flag_size=self.flag_size,
                    strip_text=self.strip_text,
                )
                # (-1, -1) marks text that was not assigned to any cell.
                if indices[:2] == (-1, -1):
                    continue
                pos_errors.append(error)
                indices = Lattice._reduce_index(
                    table, indices, shift_text=self.shift_text
                )
                for r_idx, c_idx, text in indices:
                    # A fragment that is one character long once whitespace
                    # and newlines are removed is stored bare, without the
                    # newline it arrived with.
                    collapsed = text.strip().replace("\n", "")
                    if len(collapsed) == 1:
                        text = collapsed
                    table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])
        if self.copy_text is not None:
            table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)

        data = table.data
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

        whitespace = compute_whitespace(data)
        table.flavor = "lattice"
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
        # rootname's basename is expected to look like "page-<number>".
        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
        table._image = (self.image, self.table_bbox_unscaled)
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None

        return table
        
class PDFHandler(handlers.PDFHandler):
    """PDF handler whose lattice flavor is parsed by ``custom_lattice``."""

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
    ):
        """Extract tables from every page in ``self.pages``.

        Each page is first written to a temporary directory as
        ``page-<p>.pdf`` and then handed to the parser.

        Args:
            flavor: ``"lattice"`` selects ``custom_lattice``; any other value
                selects ``Stream``.
            suppress_stdout: Forwarded to ``parser.extract_tables``.
            layout_kwargs: Mapping forwarded to ``parser.extract_tables``;
                ``None`` is treated as empty.
            **kwargs: Forwarded to the parser constructor.

        Returns:
            TableList: The extracted tables, sorted.
        """
        # Work on a private copy so neither the shared default dict nor the
        # caller's dict is handed to downstream code.
        layout_kwargs = dict(layout_kwargs) if layout_kwargs else {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
            parser_cls = custom_lattice if flavor == "lattice" else Stream
            parser = parser_cls(**kwargs)
            for page_path in pages:
                tables.extend(
                    parser.extract_tables(
                        page_path,
                        suppress_stdout=suppress_stdout,
                        layout_kwargs=layout_kwargs,
                    )
                )
        return TableList(sorted(tables))

    

def read_pdf_custom(
    filepath,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs={},
    **kwargs
):
    """Read tables from a PDF through this module's ``PDFHandler``.

    Args:
        filepath: Passed to ``PDFHandler`` as the file to read.
        pages: Page selection passed to ``PDFHandler``.
        password: Password passed to ``PDFHandler``.
        flavor: Either ``"lattice"`` or ``"stream"``.
        suppress_stdout: When true, warnings are ignored for the duration of
            the call; the flag is also forwarded to ``PDFHandler.parse``.
        layout_kwargs: Forwarded to ``PDFHandler.parse``.
        **kwargs: Checked with ``validate_input``, filtered with
            ``remove_extra`` and forwarded to ``PDFHandler.parse``.

    Returns:
        Whatever ``PDFHandler.parse`` returns.

    Raises:
        NotImplementedError: If ``flavor`` is not one of the two supported
            values.
    """
    supported_flavors = ("lattice", "stream")
    if flavor not in supported_flavors:
        message = "Unknown flavor specified. Use either 'lattice' or 'stream'"
        raise NotImplementedError(message)

    with warnings.catch_warnings():
        if suppress_stdout:
            warnings.simplefilter("ignore")

        # Validation runs on the raw kwargs, before the handler is built and
        # before flavor-irrelevant options are filtered out.
        validate_input(kwargs, flavor=flavor)
        handler = PDFHandler(filepath, pages=pages, password=password)
        parser_options = remove_extra(kwargs, flavor=flavor)
        return handler.parse(
            flavor=flavor,
            suppress_stdout=suppress_stdout,
            layout_kwargs=layout_kwargs,
            **parser_options
        )
于 2020-11-21T17:53:02.977 回答