1

我正在使用 PDFBox 的 .NET 版本解析 PDF,以提取文本及其位置。为此,我在搜索时找到了以下 Java 代码:

PDFTextStripper stripper = new PDFTextStripper()
{
    /**
     * Emits the text chunk through the superclass, then appends a
     * "[startX - endX / y]" marker built from the first and last
     * TextPosition of that chunk.
     *
     * @param text          the chunk of text being written
     * @param textPositions the positions belonging to that chunk; the first
     *                      and last entries are read, so the list is expected
     *                      to be non-empty
     * @throws IOException  if writing to the output fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
    {
        super.writeString(text, textPositions);

        final TextPosition head = textPositions.get(0);
        final TextPosition tail = textPositions.get(textPositions.size() - 1);

        // Left edge of the first position, right edge of the last one,
        // and the Y value reported for the first position.
        final float startX = head.getXDirAdj();
        final float endX = tail.getXDirAdj() + tail.getWidthDirAdj();
        final float startY = head.getYDirAdj();

        // Single-argument overload: writes the marker without re-entering this method.
        writeString(String.format("[%s - %s / %s]", startX, endX, startY));
    }
};
stripper.setSortByPosition(true);
return stripper.getText(document);

我通过以下方式将其转换为 .net:

/// <summary>
/// PDFTextStripper subclass that records per-character position details in
/// <see cref="textWithPostion"/> and appends a "[startX - endX / y]" marker
/// after every string written by the base stripper.
/// </summary>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>
    /// Accumulated dump of every processed character together with its
    /// position, font size, scale, height, space width and width.
    /// </summary>
    public string textWithPostion = "";

    /// <summary>
    /// Appends one "String[x,y fs=... xscale=... height=... space=... width=...]c"
    /// entry for the given character to <see cref="textWithPostion"/>.
    /// </summary>
    /// <param name="text">The character position reported by the stripper.</param>
    protected override void processTextPosition(TextPosition text)
    {
            // NOTE(review): the base implementation is not invoked here, so whatever
            // bookkeeping it does for this character is skipped - confirm that is intended.
            textWithPostion += "String[" + text.getXDirAdj() + "," +
            text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" +
            text.getXScale() + " height=" + text.getHeightDir() + " space=" +
            text.getWidthOfSpace() + " width=" +
            text.getWidthDirAdj() + "]" + text.getCharacter();
    }

    /// <summary>
    /// Writes the string through the base class, then appends a
    /// "[startX - endX / y]" marker computed from the first and last
    /// position in <paramref name="textPositions"/>.
    /// </summary>
    /// <param name="text">The chunk of text being written.</param>
    /// <param name="textPositions">The TextPosition entries belonging to that chunk.</param>
    protected override void writeString(java.lang.String text, java.util.List textPositions) 
    {
            base.writeString(text, textPositions);

            // Nothing to annotate when no positions were supplied.
            if (textPositions == null || textPositions.size() == 0)
            {
                return;
            }

            TextPosition firstPosition = (TextPosition)textPositions.get(0);
            TextPosition lastPosition = (TextPosition)textPositions.get(textPositions.size() - 1);

            // .NET composite formatting uses {0}/{1}/{2}; the Java-style "%s"
            // placeholders would be emitted literally and the values dropped.
            // Invariant culture keeps '.' as the decimal separator on every machine.
            writeString(String.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "[{0} - {1} / {2}]",
                firstPosition.getXDirAdj(),
                lastPosition.getXDirAdj() + lastPosition.getWidthDirAdj(),
                firstPosition.getYDirAdj()));
    }

}

但是,我收到上述代码的编译错误:

错误 1 “writeString”方法没有采用 2 个参数的重载

错误 2 'PDFTextLocationStripper.writeString(java.lang.String, java.util.List)':找不到合适的方法来覆盖

那么,如何覆盖 writeString 方法以便可以提取文本和位置?

4

1 回答 1

0

由于无法重写 writeString 方法,我改用 processTextPosition 从 PDF 中提取单词及其位置。代码如下:

/// <summary>
/// PDFTextStripper subclass that groups the non-whitespace characters it is
/// handed into <see cref="PdfWord"/> instances, stored in
/// <see cref="pdfWordsByXByY"/>.
/// </summary>
/// <remarks>
/// The outer dictionary is keyed by the character's getYDirAdj() value; each
/// inner dictionary is keyed by the word's <see cref="PdfWord.Right"/> value.
/// A character is appended to an existing word when it starts exactly at, or
/// within <see cref="MaxJoinDistance"/> of, that word's right edge.
/// </remarks>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>
    /// Largest gap between a word's right edge and the next character's X
    /// for the character to still be appended to that word.
    /// </summary>
    private const float MaxJoinDistance = 1f;

    /// <summary>Not written by this class; kept so existing callers still compile.</summary>
    public string textWithPostion = "";

    /// <summary>Words collected so far: Y value -> (right edge X -> word).</summary>
    public Dictionary<float, Dictionary<float, PdfWord>> pdfWordsByXByY;

    /// <summary>Creates a stripper with an empty word index.</summary>
    public PDFTextLocationStripper() : base()
    {
        pdfWordsByXByY = new Dictionary<float, Dictionary<float, PdfWord>>();
    }

    /// <summary>
    /// Adds one character to the word index: it either extends the word whose
    /// right edge is closest to the character's X on the same Y, or starts a
    /// new word. Whitespace-only characters are ignored.
    /// </summary>
    /// <param name="text">The character position reported by the stripper.</param>
    protected override void processTextPosition(TextPosition text)
    {
        if (text == null)
        {
            return;
        }

        try
        {
            string character = text.getCharacter();
            if (String.IsNullOrWhiteSpace(character))
            {
                return;
            }

            float textX = text.getXDirAdj();
            float textY = text.getYDirAdj();
            float charWidth = text.getWidthDirAdj();
            float charHeight = text.getHeightDir();

            Dictionary<float, PdfWord> wordsByX;
            if (!pdfWordsByXByY.TryGetValue(textY, out wordsByX))
            {
                wordsByX = new Dictionary<float, PdfWord>();
                pdfWordsByXByY.Add(textY, wordsByX);
            }

            PdfWord word;
            float wordKey;
            if (TryFindWordEndingNear(wordsByX, textX, out wordKey))
            {
                // The word is re-keyed below because appending a character
                // moves its right edge.
                word = wordsByX[wordKey];
                wordsByX.Remove(wordKey);
                word.Text += character;
            }
            else
            {
                word = new PdfWord();
                word.Text = character;
                word.StartX = textX;
                word.StartCharWidth = charWidth;
                word.Y = textY;
            }

            word.EndX = textX;
            word.EndCharWidth = charWidth;
            word.Height = charHeight;

            // If another word already occupies this right edge the existing
            // entry wins and this word is not stored.
            if (!wordsByX.ContainsKey(word.Right))
            {
                wordsByX.Add(word.Right, word);
            }
        }
        catch (Exception ex)
        {
            // Extraction stays best-effort (one bad character must not abort
            // the whole document), but the failure is no longer invisible.
            System.Diagnostics.Trace.TraceError(
                "PDFTextLocationStripper.processTextPosition failed: {0}", ex);
        }
    }

    /// <summary>
    /// Finds the key of the word that the character at <paramref name="x"/>
    /// should be appended to: an exact key match if there is one, otherwise
    /// the closest key provided it is within <see cref="MaxJoinDistance"/>.
    /// </summary>
    /// <param name="wordsByX">Words on one Y value, keyed by right edge X.</param>
    /// <param name="x">X of the character being placed.</param>
    /// <param name="key">Receives the matching key when the method returns true.</param>
    /// <returns>True when a word close enough to <paramref name="x"/> exists.</returns>
    private static bool TryFindWordEndingNear(Dictionary<float, PdfWord> wordsByX, float x, out float key)
    {
        if (wordsByX.ContainsKey(x))
        {
            key = x;
            return true;
        }

        // An explicit flag instead of a -1 sentinel, so that negative X
        // values are handled like any other coordinate.
        bool found = false;
        float minDiff = float.MaxValue;
        key = 0f;
        foreach (float candidate in wordsByX.Keys)
        {
            float diff = Math.Abs(candidate - x);
            if (diff < minDiff)
            {
                minDiff = diff;
                key = candidate;
                found = true;
            }
        }

        return found && minDiff <= MaxJoinDistance;
    }
}

这是 PdfWord 类。

 /// <summary>
 /// Plain data holder for one word: its text plus the horizontal extent,
 /// Y value and height assigned by whoever builds it.
 /// </summary>
 class PdfWord
    {
        /// <summary>The characters that make up the word.</summary>
        public string Text { get; set; }

        /// <summary>Y value shared by the word's characters.</summary>
        public float Y { get; set; }

        /// <summary>Height recorded for the word.</summary>
        public float Height { get; set; }

        /// <summary>X of the word's first character.</summary>
        public float StartX { get; set; }

        /// <summary>Width of the word's first character.</summary>
        public float StartCharWidth { get; set; }

        /// <summary>X of the word's last character.</summary>
        public float EndX { get; set; }

        /// <summary>Width of the word's last character.</summary>
        public float EndCharWidth { get; set; }

        /// <summary>Right edge of the word: last character's X plus its width.</summary>
        public float Right
        {
            get
            {
                float rightEdge = EndX + EndCharWidth;
                return rightEdge;
            }
        }
    }
于 2017-02-09T06:44:37.330 回答