java - 展平pdf内的矢量图形并使用java提取

Question

我正在尝试获取嵌入在 PDF 文件中的图像的大小（宽度和深度）。PDF 中的图像均为高分辨率矢量图像。

我尝试使用 PDFBox。对于普通图形，PDFBox 库可以完美地提取图像。但是，遇到矢量图像时，它会将不同的图层提取为不同的图像。
我还查阅了 iText 的相关资料。iText 可以将整个页面转换为光栅化图像，但我的 PDF 页面实际上包含多个图像，我需要分别提取/获取每个图像的大小。

我在这里附上我的 PDFBox 图像提取代码。请告诉我，怎样才能把一张矢量图像作为一整张图像提取出来，而不是拆成多个图层分别提取。

我的代码如下：

package com.abp.pdf.util;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

/**
 * Command-line helper that walks every page of a PDF, prints the pixel
 * dimensions of each image XObject it finds (including images nested inside
 * form XObjects) and writes each image to disk.
 *
 * <p>Only XObjects that PDFBox exposes as {@link PDXObjectImage} are handled.
 * Form XObjects are only traversed for nested images; they are never rendered
 * as a whole.
 */
public class ExtractImages {

    /**
     * Next numeric suffix tried by {@link #getUniqueFileName(String, String)}.
     * Kept on the instance so numbering continues across pages and nested
     * resources.
     */
    private int imageCounter = 1;

    private ExtractImages() {
    }

    /**
     * Program entry point.
     *
     * @param args command-line arguments; they are forwarded to
     *             {@link #extractImages(String[])}, which currently does not
     *             read them
     * @throws Exception if the document cannot be loaded, decrypted or read,
     *                   or an image cannot be written
     */
    public static void main(String[] args) throws Exception {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages(args);
    }

    /**
     * Loads the (hard-coded) PDF, verifies that content extraction is
     * permitted, and processes the resources of every page.
     *
     * @param args currently unused; the input path and all options are fixed
     *             in the method body
     * @throws Exception if loading, decrypting or extracting fails
     */
    private void extractImages(String[] args) throws Exception {
        String pdfFile = "/home/suvankar/Resources/myfile.pdf";
        String password = "";
        String prefix = null;
        boolean addKey = false;
        boolean useNonSeqParser = true;

        // Default prefix: "<pdf dir>/extracted/images/<pdf name without extension>".
        if (prefix == null && pdfFile.length() > 4) {
            int lastSlash = pdfFile.lastIndexOf("/");
            prefix = pdfFile.substring(0, lastSlash + 1)
                    + "extracted/images"
                    + pdfFile.substring(lastSlash, pdfFile.length() - 4);
        }

        PDDocument document = null;
        try {
            if (useNonSeqParser) {
                document = PDDocument.loadNonSeq(new File(pdfFile), null,
                        password);
            } else {
                document = PDDocument.load(pdfFile);

                if (document.isEncrypted()) {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(
                            password);
                    document.openProtection(spm);
                }
            }

            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException(
                        "Error: You do not have permission to extract images.");
            }

            // The prefix points into "extracted/images/", which may not exist
            // yet; create it up front so writing the first image cannot fail
            // merely because the directory is missing.
            ensureOutputDirectory(prefix);

            List<?> pages = document.getDocumentCatalog().getAllPages();
            for (Object pageObject : pages) {
                PDPage page = (PDPage) pageObject;
                processResources(page.getResources(), prefix, addKey);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Creates the directory that will contain the files named after
     * {@code prefix}, if it does not exist yet.
     *
     * @param prefix path prefix of the output files; may be {@code null}, in
     *               which case nothing is done
     * @throws IOException if the directory is missing and cannot be created
     */
    private static void ensureOutputDirectory(String prefix) throws IOException {
        if (prefix == null) {
            return;
        }
        File parent = new File(prefix).getParentFile();
        // mkdirs() also returns false when the directory already exists, so
        // re-check before treating the result as a failure.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException(
                    "Error: Could not create output directory " + parent);
        }
    }

    /**
     * Prints the size of, and writes out, every image XObject in
     * {@code resources}, recursing into form XObjects to reach nested images.
     *
     * @param resources resource dictionary to scan; {@code null} is ignored
     * @param prefix    path prefix used to build output file names
     * @param addKey    when {@code true}, the XObject's resource key is
     *                  appended to the prefix in the file name
     * @throws IOException if an image cannot be written
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey) throws IOException {
        if (resources == null) {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if (xobjects == null) {
            return;
        }
        for (Map.Entry<String, PDXObject> entry : xobjects.entrySet()) {
            String key = entry.getKey();
            PDXObject xobject = entry.getValue();

            if (xobject instanceof PDXObjectImage) {
                // Write the image.
                PDXObjectImage image = (PDXObjectImage) xobject;
                String base = addKey ? prefix + "_" + key : prefix;
                String name = getUniqueFileName(base, image.getSuffix());

                System.out.println("Writing image:" + name + "\nHeight - "+ image.getHeight() + "\nWidth - " + image.getWidth());
                // Alternative that was tried: convert via image.getRGBImage()
                // and ImageIO.write(..., "jpeg", new File(name + "-buffered.jpg")).
                image.write2file(name);
            } else if (xobject instanceof PDXObjectForm) {
                // Maybe there are more images embedded in a form object.
                PDXObjectForm xObjectForm = (PDXObjectForm) xobject;
                processResources(xObjectForm.getResources(), prefix, addKey);
            }
        }
    }

    /**
     * Returns {@code prefix + "-" + n} for the first counter value {@code n}
     * at which no file named {@code <result>.<suffix>} exists. The counter is
     * advanced past the returned value.
     *
     * @param prefix path prefix of the candidate name
     * @param suffix file extension (without the dot) used for the existence
     *               check
     * @return the unique name, without the extension
     */
    private String getUniqueFileName(String prefix, String suffix) {
        String uniqueName;
        File candidate;
        do {
            uniqueName = prefix + "-" + imageCounter;
            candidate = new File(uniqueName + "." + suffix);
            imageCounter++;
        } while (candidate.exists());
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     *
     * <p>NOTE(review): nothing in this class calls this method, and the
     * options it lists are not parsed anywhere in this class.
     */
    private static void usage() {
        System.err
                .println("Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n"
                        + "  -password  <password>        Password to decrypt document\n"
                        + "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n"
                        + "  -addkey                      add the internal image key to the file name\n"
                        + "  -nonSeq                      Enables the new non-sequential parser\n"
                        + "  <PDF file>                   The PDF document to use\n");
        System.exit(1);
    }

}

java - 展平pdf内的矢量图形并使用java提取

0 回答 0

Related

Reference