java - pdfbox 不同的pdf版本、编码、字体类型

Question

“问题是我无法阅读我自己的 pdf 文件。”

我正在测试用于编辑 pdf 文件的 pdfbox java 工具箱。我制作了一些 pdf 文件，对于 pdfbox 来说似乎都不同。所以使用 pdfbox 我逐页打开一个文件，我阅读文本（如果是 Tj 或 TJ）并在需要时替换它。我有一个带有文本的模板：“此文件由 [用户名] 制作”，并且使用我的程序，我可以用我的数据库中的用户替换 [用户名]。

现在我用谷歌文档（下载为 pdf）制作了一个 pdf 文件，用下面的代码在 pdfbox 中读取时，得到的却是空行：

// Tj operator case: the token immediately before the operator (index j - 1)
// is cast to COSString and its text is read with getString().
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();

然后我使用 LibreOffice（在 Ubuntu 上）创建了一个 pdf，这次文本使用的是 TJ 操作符，需要用下面的代码读取：

// TJ operator case: the token before the operator is an array; only its
// COSString elements are read, every other element is skipped.
COSArray previous = (COSArray) tokens.get(j - 1);
for (int idx = 0; idx < previous.size(); idx++) {
    Object element = previous.getObject(idx);
    if (!(element instanceof COSString)) {
        continue;
    }
    String string = ((COSString) element).getString();
}

这样读出来的是奇怪的字符（乱码）。

当我使用一个 Windows 的 docx 文档（我在某封电子邮件中找到的）并用在线转换器将其转换为 pdf 时，它同样包含 TJ 元素，但效果很好（我能看到文本）。

我的问题是：不同的pdf有什么区别（pdf版本1.3/1.4，字体，编码，更多？？）。更重要的是 pdfbox 理解什么或者我如何阅读其他文件？有一些元数据吗？我可以在 pdfbox 中设置编码/版本/字体类型吗？

谢谢，

提比

PS：下面是我的完整代码（仍在开发中，写得不是很好）：

package nl.tibi.pdfboxhelper;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Replaces text inside the page content streams of a PDF using PDFBox.
 *
 * <p>
 * Each page's content stream is tokenized; the string operands of the
 * {@code Tj} and {@code TJ} operators are inspected and, when they contain
 * the search text, rewritten. Operands that do not contain the search text
 * are left untouched so their original bytes are preserved.
 * </p>
 *
 * <p>
 * NOTE(review): rewritten operands are encoded as ISO-8859-1. This presumably
 * only produces readable text for fonts whose encoding is compatible with
 * Latin-1; operands drawn with subset or multi-byte encoded fonts will not
 * match a plain-text search string in the first place. Confirm against the
 * fonts used by the input documents.
 * </p>
 */
public class PdfStringReplacer {

    /** Charset used to turn replacement text back into operand bytes. */
    private static final String OPERAND_CHARSET = "ISO-8859-1";

    /**
     * Locate a string in a PDF and replace it with a new string.
     *
     * <p>
     * The search text is matched literally (not as a regular expression) and
     * only its first occurrence inside each string operand is replaced.
     * Diagnostic information about the document and every inspected operand
     * is printed to {@code System.out}.
     * </p>
     *
     * @param inputFile
     *            The PDF to open.
     * @param outputFile
     *            The PDF to write to.
     * @param strToFind
     *            The string to find in the PDF document.
     * @param message
     *            The message to write in the file.
     * @throws IOException
     *             If there is an error writing the data.
     * @throws COSVisitorException
     *             If there is an error writing the PDF.
     * @throws IllegalArgumentException
     *             If {@code strToFind} or {@code message} is {@code null}.
     */
    public void replaceString(String inputFile, String outputFile, String strToFind, String message) throws IOException, COSVisitorException {
        if (strToFind == null || message == null) {
            throw new IllegalArgumentException("strToFind and message must not be null");
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(inputFile);
            printDocumentSummary(doc);
            List pages = doc.getDocumentCatalog().getAllPages();
            for (int i = 0; i < pages.size(); i++) {
                replaceOnPage(doc, (PDPage) pages.get(i), strToFind, message);
            }
            doc.save(outputFile);
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }

    /** Prints the extracted text and the document information entries. */
    private void printDocumentSummary(PDDocument doc) throws IOException {
        PDFTextStripper textStripper = new PDFTextStripper();
        System.out.println(textStripper.getText(doc));

        PDDocumentInformation info = doc.getDocumentInformation();
        System.out.println("Page Count=" + doc.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
        System.out.println("Subject=" + info.getSubject());
        System.out.println("Keywords=" + info.getKeywords());
        System.out.println("Creator=" + info.getCreator());
        System.out.println("Producer=" + info.getProducer());
        System.out.println("Creation Date=" + info.getCreationDate());
        System.out.println("Modification Date=" + info.getModificationDate());
        System.out.println("Trapped=" + info.getTrapped());
        // The value printed here is the set of metadata keys, so label it as such.
        System.out.println("MetadataKeys=" + info.getMetadataKeys());
    }

    /** Rewrites the Tj/TJ string operands of one page and installs the updated content stream. */
    private void replaceOnPage(PDDocument doc, PDPage page, String strToFind, String message) throws IOException {
        PDStream contents = page.getContents();
        if (contents == null) {
            // Page without a content stream: nothing to parse or replace.
            return;
        }

        // A fresh graphics state normally has no font selected, in which case
        // a blank Type 1 font is used purely for the diagnostic output below.
        PDGraphicsState graphicsState = new PDGraphicsState(page.findCropBox());
        PDFont font = graphicsState.getTextState().getFont();
        if (font == null) {
            font = new PDType1Font();
        }

        PDFStreamParser parser = new PDFStreamParser(contents.getStream());
        parser.parse();
        List tokens = parser.getTokens();
        // Start at 1: an operator is only of interest when an operand token precedes it.
        for (int j = 1; j < tokens.size(); j++) {
            Object next = tokens.get(j);
            if (!(next instanceof PDFOperator)) {
                continue;
            }
            String operation = ((PDFOperator) next).getOperation();
            Object operand = tokens.get(j - 1);
            // Tj and TJ are the two operators handled here; the operand type
            // is checked so an unexpected token is skipped instead of causing
            // a ClassCastException.
            if ("Tj".equals(operation) && operand instanceof COSString) {
                COSString cosString = (COSString) operand;
                System.out.println("p: " + describeOperand(cosString));
                replaceInOperand(cosString, strToFind, message);
            } else if ("TJ".equals(operation) && operand instanceof COSArray) {
                replaceInArray((COSArray) operand, font, strToFind, message);
            }
        }

        // Now that the tokens are updated, replace the page content stream.
        PDStream updatedStream = new PDStream(doc);
        OutputStream out = updatedStream.createOutputStream();
        try {
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(tokens);
        } finally {
            // Close so that everything written reaches the underlying stream.
            out.close();
        }
        page.setContents(updatedStream);
    }

    /** Applies the replacement to every COSString element of a TJ operand array. */
    private void replaceInArray(COSArray array, PDFont font, String strToFind, String message) throws IOException {
        for (int k = 0; k < array.size(); k++) {
            Object arrElement = array.getObject(k);
            if (!(arrElement instanceof COSString)) {
                continue;
            }
            COSString cosString = (COSString) arrElement;
            byte[] raw = cosString.getBytes();
            // Encoding the first byte needs at least one byte to be present.
            String firstChar = raw.length > 0 ? font.encode(raw, 0, 1) : "";
            System.out.println(firstChar + " " + describeOperand(cosString));
            replaceInOperand(cosString, strToFind, message);
        }
    }

    /**
     * Replaces the first literal occurrence of {@code strToFind} in the operand.
     * The operand is only rewritten when its text actually changes, so operands
     * without a match keep their original bytes.
     */
    private void replaceInOperand(COSString operand, String strToFind, String message) throws IOException {
        String original = operand.getString();
        String replaced = replaceFirstLiteral(original, strToFind, message);
        if (replaced.equals(original)) {
            return;
        }
        operand.reset();
        operand.append(replaced.getBytes(OPERAND_CHARSET));
    }

    /** Builds the diagnostic line for an operand: text, hex dump, and the bytes read as UTF-16BE and ISO-8859-1. */
    private static String describeOperand(COSString operand) throws IOException {
        byte[] raw = operand.getBytes();
        return operand.getString() + " <=> " + operand.getHexString() + " <->" + new String(raw, "UTF-16BE")
                + " <->" + new String(raw, OPERAND_CHARSET);
    }

    /** Replaces the first occurrence of {@code target} in {@code text}, treating both arguments as plain text. */
    private static String replaceFirstLiteral(String text, String target, String replacement) {
        int at = text.indexOf(target);
        if (at < 0) {
            return text;
        }
        return text.substring(0, at) + replacement + text.substring(at + target.length());
    }
}

java - pdfbox 不同的pdf版本、编码、字体类型

0 回答 0

Related

Reference