我正在从 PDF 文件中提取文本,并在提取时将该文本转换为 HTML。
这样做时,文本会出现重叠,从而导致阅读变得非常困难。我正在使用 itextsharp 执行提取。
这段代码中的什么导致了该错误?
public void Extract_inputpdf()
{
string pdf_of_inputFile = targetPathip;
StringBuilder sb_inputpdf = new StringBuilder();
PdfReader reader_inputPdf = new PdfReader(LblFleip.Text); //read PDF
divip.InnerHtml = "";
divop.InnerHtml = "";
sb_inputpdf.Length = 0;
text_output_File = string.Empty;
text_output_File_report = string.Empty;
text_input_File = string.Empty;
text_input_File_report = string.Empty;
input_pdf = string.Empty;
input_pdf_report = string.Empty;
output_pdf = string.Empty;
output_pdf_report = string.Empty;
rtbxinput_box_input = string.Empty;
rtbxinput_box_output = string.Empty;
Pagesize_input = 0;
FinalPagesize_input = 0;
Pagesize_output = 0;
FinalPagesize_output = 0;
for (i = 1;i <= reader_inputPdf.NumberOfPages;i++)
{
Pagesize_input = FinalPagesize_input;
height_input = reader_inputPdf.GetPageSizeWithRotation(i).Height;
pagenumber_input = i;
TextWithFont_inputPdf inputpdf = new TextWithFont_inputPdf();
text_input_File = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader_inputPdf, i, inputpdf);
sb_inputpdf.Append(text_input_File);
divip.InnerHtml = sb_inputpdf.ToString();
input_pdf = sb_inputpdf.ToString();
}
reader_inputPdf.Close();
}
//read input pdf ----------
public class TextWithFont_inputPdf : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
private StringBuilder result = new StringBuilder();
private Vector lastBaseLine;
private string lastFont;
private float lastFontSize;
public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
{
string curFont = renderInfo.GetFont().PostscriptFontName;
string curfontweight = renderInfo.GetFont().PostscriptFontName;
float height_extract_input = Publishing.height_input;
int pagenumber_input = Publishing.i;//page number
string fontweight = string.Empty;
string fontstyle = string.Empty;
string presentfont = string.Empty;
string divide = curFont;
string[] fontnames = null;
//--------------------------------------------------------
Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
y_direction_input = 10 + Publishing.Pagesize_input + (height_extract_input - curBaseline[Vector.I2]);
Publishing.FinalPagesize_input = y_direction_input;
iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
Single curFontSize = rect.Height;
//---------------------------------------
if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
{
if ((this.lastBaseLine != null))
{
this.result.AppendLine("</span>");
}
if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
{
//this.result.AppendLine("<br />");
}
this.result.AppendFormat("<span style=\"font-family:{0};font-weight:{1};font-style:{2};margin-left:{3}pt;top:{4}pt; position:absolute;\">", curFont, fontweight, fontstyle, curBaseline[Vector.I1], y_direction_input);
}
this.result.Append(renderInfo.GetText());
this.lastBaseLine = curBaseline;
this.lastFontSize = curFontSize;
this.lastFont = curFont;
}
catch (Exception ex)
{
logWriter.Error("TextWithFont_inputPdf() : " + ex.Message);
}
}
public string GetResultantText()
{
if (result.Length > 0)
{
result.Append("</span>");
}
return result.ToString();
}
public void BeginTextBlock() { }
public void EndTextBlock() { }
public void RenderImage(ImageRenderInfo renderInfo) { }
}