1

使用以下代码将 PDF 文件写入 HTML 文件格式时...

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.PDFDomTreeConfig;
import org.fit.pdfdom.resource.HtmlResourceHandler;
import org.fit.pdfdom.resource.SaveResourceToDirHandler;

/**
 * Converts PDF documents to HTML files using PDFBox for loading and Pdf2Dom
 * ({@link PDFDomTree}) for rendering.
 */
public class PdfToHtmlConverter {

    /**
     * Renders the given PDF as an HTML file named
     * {@code outputFilePath + File.separator + outputFileName + ".html"}.
     *
     * <p>Fonts and images are routed through {@link SaveResourceToDirHandler}
     * instances registered on the parser configuration.
     * NOTE(review): the no-argument handler uses the library's default target
     * directory - confirm that this is where extracted resources should go.
     *
     * @param file           the PDF file to convert
     * @param outputFilePath directory in which the HTML file is created
     * @param outputFileName name of the HTML file, without the {@code .html} extension
     * @return the path of the HTML file that was written
     * @throws InvalidPasswordException if the PDF is encrypted and cannot be opened
     * @throws IOException              if the PDF cannot be read, converted, or the HTML file cannot be written
     * @throws ParserConfigurationException if the DOM parser cannot be created
     */
    public String pdfToHtmlFileWriter(File file, String outputFilePath, String outputFileName) throws InvalidPasswordException, IOException, ParserConfigurationException {
        String outputFile = outputFilePath + File.separator + outputFileName + ".html";

        // Configure how embedded fonts and images are emitted.
        PDFDomTreeConfig config = PDFDomTreeConfig.createDefaultConfig();
        HtmlResourceHandler fontHandler = new SaveResourceToDirHandler();
        config.setFontHandler(fontHandler);
        HtmlResourceHandler imageHandler = new SaveResourceToDirHandler();
        config.setImageHandler(imageHandler);

        // try-with-resources closes both the document and the writer on every
        // exit path, including when an Error (not just an Exception) is thrown
        // during conversion. The PDF is loaded first, so no output file is
        // created when loading fails.
        try (PDDocument pdf = PDDocument.load(file);
             // Explicit UTF-8 so the output does not depend on the platform default charset.
             Writer output = Files.newBufferedWriter(Paths.get(outputFile), StandardCharsets.UTF_8)) {
            // The configuration must be passed to the parser; a parser created
            // with the no-argument constructor ignores the handlers set above.
            PDFDomTree parser = new PDFDomTree(config);
            // Failures propagate to the caller instead of being printed and
            // swallowed, so a path is only returned for a completed conversion.
            parser.writeText(pdf, output);
        }

        return outputFile;
    }
}

build.gradle文件具有以下依赖项列表...

dependencies {
    // Every jar placed in the local lib/ directory
    compile fileTree(dir: 'lib', include: ['*.jar'])

    // Apache PDFBox
    compile 'org.apache.pdfbox:pdfbox:2.0.6'
    compile 'org.apache.pdfbox:pdfbox-tools:2.0.6'

    // Log4j 2
    // NOTE(review): 'log4j' is 2.11.0 while log4j-api/log4j-core are 2.6.1 - confirm the versions are meant to differ
    compile 'org.apache.logging.log4j:log4j:2.11.0'
    compile 'org.apache.logging.log4j:log4j-api:2.6.1'
    compile 'org.apache.logging.log4j:log4j-core:2.6.1'

    // Mail and BouncyCastle
    // NOTE(review): bcmail is 1.46 (jdk15) while bcprov is 1.47 (jdk15on) - confirm the pairing
    compile 'javax.mail:mail:1.4.1'
    compile 'org.bouncycastle:bcmail-jdk15:1.46'
    compile 'org.bouncycastle:bcprov-jdk15on:1.47'

    // Caching and utilities
    compile 'net.sf.ehcache:ehcache-core:2.4.6'
    compile 'com.google.guava:guava:11.0.2'
    compile 'redis.clients:jedis:2.9.0'

    // Apache POI
    compile 'org.apache.poi:poi-ooxml:3.17'
    compile 'org.apache.poi:poi:3.17'

    // Pdf2Dom (PDF to HTML DOM) and the JBIG2 ImageIO plugin
    compile 'net.sf.cssbox:pdf2dom:1.7'
    compile 'com.levigo.jbig2:levigo-jbig2-imageio:1.6.5'

    // JSON
    compile 'com.google.code.gson:gson:2.8.2'
    compile 'org.json:json:20180130'
}

糟糕!运行时 JDK 抛出了以下异常...

[org.glassfish.jersey.server.ContainerException: java.util.ServiceConfigurationError: com.levigo.jbig2.util.log.LoggerBridge: Provider com.levigo.jbig2.util.log.JDKLoggerBridge not a subtype] with root cause
java.util.ServiceConfigurationError: com.levigo.jbig2.util.log.LoggerBridge: Provider com.levigo.jbig2.util.log.JDKLoggerBridge not a subtype
    at java.util.ServiceLoader.fail(Unknown Source)
    at java.util.ServiceLoader.access$300(Unknown Source)
    at java.util.ServiceLoader$LazyIterator.nextService(Unknown Source)
    at java.util.ServiceLoader$LazyIterator.next(Unknown Source)
    at java.util.ServiceLoader$1.next(Unknown Source)
    at com.levigo.jbig2.util.log.LoggerFactory.getLogger(LoggerFactory.java:42)
    at com.levigo.jbig2.util.log.LoggerFactory.getLogger(LoggerFactory.java:48)
    at com.levigo.jbig2.JBIG2ImageReader.<clinit>(JBIG2ImageReader.java:45)
    at com.levigo.jbig2.JBIG2ImageReaderSpi.createReaderInstance(JBIG2ImageReaderSpi.java:116)
    at javax.imageio.spi.ImageReaderSpi.createReaderInstance(Unknown Source)
    at javax.imageio.ImageIO$ImageReaderIterator.next(Unknown Source)
    at javax.imageio.ImageIO$ImageReaderIterator.next(Unknown Source)
    at org.apache.pdfbox.filter.Filter.findImageReader(Filter.java:133)
    at org.apache.pdfbox.filter.JBIG2Filter.decode(JBIG2Filter.java:54)
    at org.apache.pdfbox.cos.COSInputStream.create(COSInputStream.java:69)
    at org.apache.pdfbox.cos.COSStream.createInputStream(COSStream.java:167)
    at org.apache.pdfbox.pdmodel.common.PDStream.createInputStream(PDStream.java:235)
    at org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject.<init>(PDImageXObject.java:125)
    at org.apache.pdfbox.pdmodel.graphics.PDXObject.createXObject(PDXObject.java:70)
    at org.apache.pdfbox.pdmodel.PDResources.getXObject(PDResources.java:409)
    at org.fit.pdfdom.PDFBoxTree.processFontResources(PDFBoxTree.java:397)
    at org.fit.pdfdom.PDFBoxTree.updateFontTable(PDFBoxTree.java:361)
    at org.fit.pdfdom.PDFDomTree.updateFontTable(PDFDomTree.java:544)
    at org.fit.pdfdom.PDFBoxTree.processPage(PDFBoxTree.java:206)
    at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:319)
    at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266)
    at org.fit.pdfdom.PDFDomTree.createDOM(PDFDomTree.java:218)
    at org.fit.pdfdom.PDFDomTree.writeText(PDFDomTree.java:194)
    at com.pype.html.converter.PdfToHtmlConverter.pdfToHtmlFileWriter(PdfToHtmlConverter.java:91)
    at com.pype.drawings.slicing.VerticalSlicer.convertCompleteSinglePagePdftoHtml(VerticalSlicer.java:540)
    at com.pype.drawings.slicing.VerticalSlicer.convertCompletePdfPageToHtml(VerticalSlicer.java:104)
    at com.pype.pdf.schedules.extractor.ExtractSchedules.generateHtmlFiles(ExtractSchedules.java:344)
    at com.pype.pdf.schedules.extractor.ExtractSchedules.getIdentifiedSchedulesUsingElements(ExtractSchedules.java:218)
    at com.pype.solr.rest.api.ExtractPDFDrawing.processUploadedPDFFile(ExtractPDFDrawing.java:511)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
    at java.lang.reflect.Method.invoke(Unknown Source)
    at org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory$1.invoke(ResourceMethodInvocationHandlerFactory.java:81)
    at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:144)
    at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:161)
    at org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$TypeOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:205)
    at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:99)
    at org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:389)
    at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:347)
    at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:102)
    at org.glassfish.jersey.server.ServerRuntime$2.run(ServerRuntime.java:326)
    at org.glassfish.jersey.internal.Errors$1.call(Errors.java:271)
    at org.glassfish.jersey.internal.Errors$1.call(Errors.java:267)
    at org.glassfish.jersey.internal.Errors.process(Errors.java:315)
    at org.glassfish.jersey.internal.Errors.process(Errors.java:297)
    at org.glassfish.jersey.internal.Errors.process(Errors.java:267)
    at org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:317)
    at org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:305)
    at org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:1154)
    at org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:473)
    at org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:427)
    at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:388)
    at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:341)
    at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:228)
    at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:231)
    at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
    at org.apache.tomcat.websocket.server.WsFilter.doFilter(WsFilter.java:53)
    at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
    at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
    at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:199)
    at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:96)
    at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:502)
    at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:140)
    at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:81)
    at org.apache.catalina.valves.AbstractAccessLogValve.invoke(AbstractAccessLogValve.java:651)
    at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:87)
    at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:342)
    at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:501)
    at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:66)
    at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:754)
    at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1376)
    at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)
    at java.lang.Thread.run(Unknown Source)

我搜索了有关此错误的更多信息,但没有找到任何线索。如果有人了解这个问题,请给出一些建议。

谢谢

4

1 回答 1

1

请更新到最新版本的 jbig2 解码器,即 3.0.2。感谢 levigo solutions GmbH 的贡献,jbig2 解码器现在是 Apache PDFBox 的一部分。对于 Maven,请使用以下依赖:

    <!-- JBIG2 ImageIO decoder, published under the Apache PDFBox group id -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>jbig2-imageio</artifactId>
        <version>3.0.2</version>
    </dependency>

或者使用直接下载

于 2018-10-27T10:22:15.493 回答