2

我正在从 PDF 文件中提取文本,并在提取时将该文本转换为 HTML。

这样做时,文本会出现重叠,从而导致阅读变得非常困难。我正在使用 itextsharp 执行提取。

这段代码中的什么导致了该错误?

    public void Extract_inputpdf()
    {

    string pdf_of_inputFile = targetPathip;


    StringBuilder sb_inputpdf = new StringBuilder();
    PdfReader reader_inputPdf = new PdfReader(LblFleip.Text); //read PDF

    divip.InnerHtml = "";

    divop.InnerHtml = "";

    sb_inputpdf.Length = 0;
    text_output_File = string.Empty;
    text_output_File_report = string.Empty;
    text_input_File = string.Empty;
    text_input_File_report = string.Empty;
    input_pdf = string.Empty;

    input_pdf_report = string.Empty;
    output_pdf = string.Empty;
    output_pdf_report = string.Empty;
    rtbxinput_box_input = string.Empty;
    rtbxinput_box_output = string.Empty;
    Pagesize_input = 0;
    FinalPagesize_input = 0;
    Pagesize_output = 0;
    FinalPagesize_output = 0;

    for (i = 1;i <= reader_inputPdf.NumberOfPages;i++)
    {
        Pagesize_input = FinalPagesize_input;
        height_input = reader_inputPdf.GetPageSizeWithRotation(i).Height;
        pagenumber_input = i;

        TextWithFont_inputPdf inputpdf = new TextWithFont_inputPdf();

        text_input_File = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader_inputPdf, i, inputpdf);

        sb_inputpdf.Append(text_input_File);

        divip.InnerHtml = sb_inputpdf.ToString();

        input_pdf = sb_inputpdf.ToString();
    }

    reader_inputPdf.Close();
}

//read input pdf ----------


public class TextWithFont_inputPdf : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{

    private StringBuilder result = new StringBuilder();

    private Vector lastBaseLine;
    private string lastFont;
    private float lastFontSize;

    public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
    {

        string curFont = renderInfo.GetFont().PostscriptFontName;
        string curfontweight = renderInfo.GetFont().PostscriptFontName;
        float height_extract_input = Publishing.height_input;

        int pagenumber_input = Publishing.i;//page number

        string fontweight = string.Empty;
        string fontstyle = string.Empty;
        string presentfont = string.Empty;
        string divide = curFont;
        string[] fontnames = null;

        //--------------------------------------------------------     
        Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();

        Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
        y_direction_input = 10 + Publishing.Pagesize_input + (height_extract_input - curBaseline[Vector.I2]);
        Publishing.FinalPagesize_input = y_direction_input;

        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
        Single curFontSize = rect.Height;

            //---------------------------------------
            if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
            {

                if ((this.lastBaseLine != null))
                {
                    this.result.AppendLine("</span>");
                }

                if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
                {
                    //this.result.AppendLine("<br />");
                }

                this.result.AppendFormat("<span style=\"font-family:{0};font-weight:{1};font-style:{2};margin-left:{3}pt;top:{4}pt; position:absolute;\">", curFont, fontweight, fontstyle, curBaseline[Vector.I1], y_direction_input); 

            }
            this.result.Append(renderInfo.GetText());

            this.lastBaseLine = curBaseline;
            this.lastFontSize = curFontSize;

            this.lastFont = curFont;

        }
        catch (Exception ex)
        {
            logWriter.Error("TextWithFont_inputPdf() : " + ex.Message);
        }
    }


    public string GetResultantText()
    {

        if (result.Length > 0)
        {
            result.Append("</span>");
        }
        return result.ToString();
    }


    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderImage(ImageRenderInfo renderInfo) { }
}
4

0 回答 0