“问题是我无法读取我自己的 PDF 文件。”
我正在测试用于编辑 PDF 文件的 Java 库 PDFBox。我制作了几个 PDF 文件,但 PDFBox 对它们的处理结果似乎各不相同。我用 PDFBox 逐页打开文件,读取其中的文本(即 Tj 或 TJ 操作符的操作数),并在需要时进行替换。我有一个模板,其中包含文本“此文件由 [用户名] 制作”,我的程序可以把 [用户名] 替换成数据库中的用户名。
现在我用 Google 文档(下载为 PDF)制作了一个 PDF 文件,用下面的代码在 PDFBox 中读取时,得到的却是空行:
// Tj case: the current token is a Tj operator, so the token just before it
// (index j - 1) is taken to be its string operand.
COSString previous = (COSString) tokens.get(j - 1);
// Read the operand as a Java String.
String string = previous.getString();
然后我用 LibreOffice(在 Ubuntu 上)创建了一个 PDF,这次文本使用的是 TJ 操作符,需要用下面的代码来读取:
// TJ case: the token just before the operator (index j - 1) is taken to be
// an array operand.
COSArray previous = (COSArray) tokens.get(j - 1);
// Visit every array element by index.
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
// Only COSString elements are read as text; other elements are ignored.
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
}
}
这样读出来的是一些奇怪的字符。
当我拿一个 Windows 的 docx 文档(在某封电子邮件里找到的)并用在线转换器把它转换成 PDF 时,得到的文件同样使用 TJ 操作符,而且读取正常(我能看到文本)。
我的问题是:这些 PDF 之间有什么区别(PDF 版本 1.3/1.4、字体、编码,还是别的什么)?更重要的是,PDFBox 能识别哪些情况,我又该如何读取其他那些文件?是否有相关的元数据可供判断?我能在 PDFBox 中设置编码/版本/字体类型吗?
谢谢,
提比
附:下面是我的完整代码(仍在开发中,写得不太好):
package nl.tibi.pdfboxhelper;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Replaces a literal piece of text inside the page content streams of a PDF.
 *
 * <p>Only text that appears verbatim in the operand of a single {@code Tj}
 * operator, or in a single string element of a {@code TJ} array, can be
 * found. Strings are compared after decoding with
 * {@link COSString#getString()}, so text drawn with a font that uses a
 * custom or multi-byte encoding will generally not match.
 */
public class PdfStringReplacer {

    /** Charset used to turn a replaced string back into PDF string bytes. */
    private static final String STRING_CHARSET = "ISO-8859-1";

    /**
     * Locate a string in a PDF and replace it with a new string.
     *
     * <p>The search text is matched literally (not as a regular expression)
     * and only the first occurrence inside each text string is replaced.
     * Text strings that do not contain the search text are left untouched.
     *
     * @param inputFile
     *            The PDF to open.
     * @param outputFile
     *            The PDF to write to.
     * @param strToFind
     *            The string to find in the PDF document.
     * @param message
     *            The message to write in the file.
     * @throws IOException
     *             If there is an error reading or writing the data.
     * @throws COSVisitorException
     *             If there is an error writing the PDF.
     * @throws NullPointerException
     *             If {@code strToFind} or {@code message} is {@code null}.
     */
    public void replaceString(String inputFile, String outputFile, String strToFind, String message) throws IOException, COSVisitorException {
        if (strToFind == null) {
            throw new NullPointerException("strToFind");
        }
        if (message == null) {
            throw new NullPointerException("message");
        }

        PDDocument doc = null;
        try {
            doc = PDDocument.load(inputFile);
            printDocumentInfo(doc);

            List<?> pages = doc.getDocumentCatalog().getAllPages();
            for (Object pageObject : pages) {
                PDPage page = (PDPage) pageObject;
                PDStream contents = page.getContents();
                if (contents == null) {
                    // A page without a content stream has nothing to replace.
                    continue;
                }

                PDFStreamParser parser = new PDFStreamParser(contents.getStream());
                parser.parse();
                List<?> tokens = parser.getTokens();
                replaceInTokens(tokens, strToFind, message);

                // Now that the tokens are updated, replace the page content stream.
                PDStream updatedStream = new PDStream(doc);
                OutputStream out = updatedStream.createOutputStream();
                try {
                    ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                    tokenWriter.writeTokens(tokens);
                } finally {
                    out.close();
                }
                page.setContents(updatedStream);
            }
            doc.save(outputFile);
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }

    /** Prints the extracted text and the document information entries to standard output. */
    private static void printDocumentInfo(PDDocument doc) throws IOException {
        System.out.println(new PDFTextStripper().getText(doc));
        PDDocumentInformation info = doc.getDocumentInformation();
        System.out.println("Page Count=" + doc.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
        System.out.println("Subject=" + info.getSubject());
        System.out.println("Keywords=" + info.getKeywords());
        System.out.println("Creator=" + info.getCreator());
        System.out.println("Producer=" + info.getProducer());
        System.out.println("Creation Date=" + info.getCreationDate());
        System.out.println("Modification Date=" + info.getModificationDate());
        System.out.println("Trapped=" + info.getTrapped());
        // The label is kept as-is for output compatibility; the value is the metadata key set.
        System.out.println("isNeedToBeUpdate=" + info.getMetadataKeys());
    }

    /** Applies the replacement to the operand of every Tj / TJ operator in a token list. */
    private static void replaceInTokens(List<?> tokens, String strToFind, String message) throws IOException {
        // Start at 1: an operator at index 0 has no preceding operand.
        for (int j = 1; j < tokens.size(); j++) {
            Object token = tokens.get(j);
            if (!(token instanceof PDFOperator)) {
                continue;
            }
            String operation = ((PDFOperator) token).getOperation();
            Object operand = tokens.get(j - 1);

            // Tj and TJ are the two operators handled here; the operand type is
            // checked so a malformed stream is skipped instead of throwing.
            if ("Tj".equals(operation) && operand instanceof COSString) {
                replaceInString((COSString) operand, strToFind, message);
            } else if ("TJ".equals(operation) && operand instanceof COSArray) {
                COSArray array = (COSArray) operand;
                for (int k = 0; k < array.size(); k++) {
                    Object element = array.getObject(k);
                    if (element instanceof COSString) {
                        replaceInString((COSString) element, strToFind, message);
                    }
                }
            }
        }
    }

    /** Rewrites one PDF string in place, but only when the search text actually occurs in it. */
    private static void replaceInString(COSString cosString, String strToFind, String message) throws IOException {
        String original = cosString.getString();
        if (original == null) {
            return;
        }
        String replaced = replaceFirstLiteral(original, strToFind, message);
        if (replaced.equals(original)) {
            // Do not round-trip untouched strings through STRING_CHARSET:
            // that would damage bytes that are not Latin-1 text.
            return;
        }
        cosString.reset();
        cosString.append(replaced.getBytes(STRING_CHARSET));
    }

    /** Replaces the first literal occurrence of {@code target} in {@code text}; no regex semantics. */
    private static String replaceFirstLiteral(String text, String target, String replacement) {
        int at = text.indexOf(target);
        if (at < 0) {
            return text;
        }
        return text.substring(0, at) + replacement + text.substring(at + target.length());
    }
}