java - Read multiple PDF files inside a folder using iText

Question

I am having trouble reading PDF files with iText in Java. I know how to read a page from a single PDF file. Now I want to read multiple PDF files from a folder.

How can I achieve this task?

score 1 · Accepted Answer

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import part1.chapter01.HelloWorld;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.io.RandomAccessSourceFactory;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PRTokeniser;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfTemplate;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.RandomAccessFileOrArray;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;

/**
 * Example that writes a small PDF whose text fragments are emitted out of
 * reading order, then reads the first page back in two ways: by tokenising
 * the raw content stream and by running it through a content stream processor.
 */
public class ParsingHelloWorld {

    /** The resulting PDF. */
    public static final String PDF = "results/part4/chapter15/hello_reverse.pdf";
    /** A possible result after parsing the PDF. */
    public static final String TEXT1 = "results/part4/chapter15/result1.txt";
    /** A possible result after parsing the PDF. */
    public static final String TEXT2 = "results/part4/chapter15/result2.txt";
    /** A possible result after parsing the PDF. */
    public static final String TEXT3 = "results/part4/chapter15/result3.txt";

    /**
     * Generates a PDF file with the text 'Hello World' written to the direct
     * content in reverse fragment order, plus the text 'Hello People' inside
     * a form XObject.
     * @param filename the path of the PDF file to create
     * @throws DocumentException
     * @throws IOException
     */
    public void createPdf(String filename) throws DocumentException, IOException {
        // The stream is managed here so that it is released even when one of
        // the steps below throws before document.close() is reached.
        try (FileOutputStream fos = new FileOutputStream(filename)) {
            // step 1
            Document document = new Document();
            // step 2
            PdfWriter writer = PdfWriter.getInstance(document, fos);
            // step 3
            document.open();
            // step 4
            // we add the text to the direct content, but not in the right order:
            // each moveText uses a negative x offset, so later fragments are
            // positioned to the left of earlier ones
            PdfContentByte cb = writer.getDirectContent();
            BaseFont bf = BaseFont.createFont();
            cb.beginText();
            cb.setFontAndSize(bf, 12);
            cb.moveText(88.66f, 367);
            cb.showText("ld");
            cb.moveText(-22f, 0);
            cb.showText("Wor");
            cb.moveText(-15.33f, 0);
            cb.showText("llo");
            cb.moveText(-15.33f, 0);
            cb.showText("He");
            cb.endText();
            // we also add text in a form XObject
            PdfTemplate tmp = cb.createTemplate(250, 25);
            tmp.beginText();
            tmp.setFontAndSize(bf, 12);
            tmp.moveText(0, 7);
            tmp.showText("Hello People");
            tmp.endText();
            cb.addTemplate(tmp, 36, 343);
            // step 5
            document.close();
        }
    }

    /**
     * Parses the PDF using PRTokeniser: every STRING token found in the
     * content stream of page 1 is written on its own line.
     * @param src  the path to the original PDF file
     * @param dest the path to the resulting text file
     * @throws IOException
     */
    public void parsePdf(String src, String dest) throws IOException {
        PdfReader reader = new PdfReader(src);
        try {
            // we can inspect the syntax of the imported page
            byte[] streamBytes = reader.getPageContent(1);
            PRTokeniser tokenizer = new PRTokeniser(new RandomAccessFileOrArray(
                    new RandomAccessSourceFactory().createSource(streamBytes)));
            // try-with-resources closes the writer even if tokenising fails
            try (PrintWriter out = new PrintWriter(new FileOutputStream(dest))) {
                while (tokenizer.nextToken()) {
                    if (tokenizer.getTokenType() == PRTokeniser.TokenType.STRING) {
                        out.println(tokenizer.getStringValue());
                    }
                }
                out.flush();
            }
        } finally {
            // the reader is released on both the normal and the exception path
            reader.close();
        }
    }

    /**
     * Extracts text from a PDF document by processing the content of page 1
     * with a MyTextRenderListener that receives the output writer.
     * @param src  the original PDF document
     * @param dest the resulting text file
     * @throws IOException
     */
    public void extractText(String src, String dest) throws IOException {
        // try-with-resources closes the writer even if opening or processing
        // the PDF fails
        try (PrintWriter out = new PrintWriter(new FileOutputStream(dest))) {
            PdfReader reader = new PdfReader(src);
            try {
                RenderListener listener = new MyTextRenderListener(out);
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary pageDic = reader.getPageN(1);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, 1), resourcesDic);
                out.flush();
            } finally {
                reader.close();
            }
        }
    }

    /**
     * Main method. Runs the HelloWorld example, creates the reversed PDF and
     * writes the three result text files.
     * @param    args    no arguments needed
     * @throws DocumentException
     * @throws IOException
     */
    public static void main(String[] args) throws DocumentException, IOException {
        ParsingHelloWorld example = new ParsingHelloWorld();
        HelloWorld.main(args);
        example.createPdf(PDF);
        example.parsePdf(HelloWorld.RESULT, TEXT1);
        example.parsePdf(PDF, TEXT2);
        example.extractText(PDF, TEXT3);
    }
}

取自 iTextPDF 网站。这段代码向您展示了如何读取单个 PDF。如果您想从一个文件夹中读取多个 PDF，您需要一个 DirectoryStream<Path>。

DirectoryStream<Path> allPDF = Files.newDirectoryStream(Paths.get("/path/to/folder"),"*.pdf");

现在，DirectoryStream 仅包含与文件夹中的 PDF 文件相对应的那些 Path 对象。

遍历DirectoryStream，获取Path对象的绝对路径，然后做任何你想做的事情。

java - Read multiple PDF files inside a folder using iText

1 回答 1

Related

Reference