0

我正在使用Apache poi库使用以下 java 代码将doc 文件转换为 pdf :

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import com.lowagie.text.Document;
import com.lowagie.text.Paragraph;
import com.lowagie.text.pdf.PdfWriter;

public class TestDoc {

    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        POIFSFileSystem fs = null;
        Document document = new Document();
        try {
            System.out.println("Starting the test");

            //D:\vijay\doctopdf
            fs = new POIFSFileSystem(new FileInputStream("D:/vijay/doctopdf/test.doc"));

            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);

            OutputStream file = new FileOutputStream(new File("D:/vijay/doctopdf/test.pdf"));

            PdfWriter writer = PdfWriter.getInstance(document, file);

            Range range = doc.getRange();
            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            String[] paragraphs = we.getParagraphText();
            for (int i = 0; i < paragraphs.length; i++) {

                org.apache.poi.hwpf.usermodel.Paragraph pr = range
                        .getParagraph(i);
                // CharacterRun run = pr.getCharacterRun(i);
                // run.setBold(true);
                // run.setCapitalized(true);
                // run.setItalic(true);
                paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                System.out.println("Length:" + paragraphs[i].length());
                System.out.println("Paragraph" + i + ": "
                        + paragraphs[i].toString());

                // add the paragraph to the document
                document.add(new Paragraph(paragraphs[i]));
            }

            System.out.println("Document testing completed");
        } catch (Exception e) {
            System.out.println("Exception during test");
            e.printStackTrace();
        } finally {
            // close the document
            document.close();
        }
    }

}

上述代码运行成功(仅转换 pdf 中的文本)。但是当 doc 包含表格或图像等时,它不会出现在结果 pdf 中。任何人都知道我怎样才能获得具有完整准确性和格式的 pdf 文档。

4

1 回答 1

0

您可以使用 Apache Tika Parser 中的 WordExtractor

于 2014-05-27T14:14:36.773 回答