1

我在我的 Java EE 项目(在 Apache Tomcat 7.0.32 上运行)中使用 Apache POI 库将 .doc 文件转换为 .html 文件。

我将org.apache.poi.hwpf.converter包中的WordToHtmlConverter.java类作为基础。我唯一改变的是 - 我在构造函数中传递输出流的文件(而不是在main()方法中)。

当我运行我的应用程序并上传 .doc 文件(大小 < 500KB)时,我看到 java.exe 进程增加了 10-15 MB。当我上传另一个 .doc 文件时,java.exe 再次增加了 2-3 MB。所以每次我上传 .doc 文件时,java.exe 都会增加 2-3 MB。如果我上传大的 .doc 文件(大小 > 3MB),java.exe 会再次增加 20-30 MB。所以它一直持续到java.lang.OutOfMemoryError: Java heap space

为什么会这样?为什么每次上传 .doc 文件时 java.exe 进程会增加 2-3 MB?我应该怎么做才能避免出现java.lang.OutOfMemoryError

这是我在我的应用程序中使用的 WordToHtmlConverter.java 的源代码:

package ru.emzior.view.convertors;

import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.HtmlDocumentFacade;
import org.apache.poi.hwpf.converter.AbstractWordConverter;
import java.io.File;
import java.io.FileWriter;
import java.util.List;
import java.util.Stack;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.OfficeDrawing;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.Beta;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;

/**
 * Converts Word files (95-2007) into HTML files.
 * <p>
 * This implementation doesn't create images or links to them. This can be
 * changed by overriding {@link #processImage(Element, boolean, Picture)}
 * method.
 * 
 * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
 */
@Beta
public class DocToHtmlConverter extends AbstractWordConverter implements HtmlConverter
{

    private String filePath;
    private String tempDirectory;

    private static class BlockProperies
    {
        final String pFontName;
        final int pFontSize;

        public BlockProperies( String pFontName, int pFontSize )
        {
            this.pFontName = pFontName;
            this.pFontSize = pFontSize;
        }
    }

    public DocToHtmlConverter(String filePath, String tempDirectory) {
        this.filePath = filePath;
        this.htmlDocumentFacade = null;
        this.tempDirectory = tempDirectory;
    }

    private static final POILogger logger = POILogFactory
            .getLogger( DocToHtmlConverter.class );

    private static String getSectionStyle( Section section )
    {
        float leftMargin = section.getMarginLeft() / TWIPS_PER_INCH;
        float rightMargin = section.getMarginRight() / TWIPS_PER_INCH;
        float topMargin = section.getMarginTop() / TWIPS_PER_INCH;
        float bottomMargin = section.getMarginBottom() / TWIPS_PER_INCH;

        String style = "margin: " + topMargin + "in " + rightMargin + "in "
                + bottomMargin + "in " + leftMargin + "in;";

        if ( section.getNumColumns() > 1 )
        {
            style += "column-count: " + ( section.getNumColumns() ) + ";";
            if ( section.isColumnsEvenlySpaced() )
            {
                float distance = section.getDistanceBetweenColumns()
                        / TWIPS_PER_INCH;
                style += "column-gap: " + distance + "in;";
            }
            else
            {
                style += "column-gap: 0.25in;";
            }
        }
        return style;
    }


    public File convertToFile(){
        String slashType = (filePath.lastIndexOf("\\") > 0) ? "\\" : "/"; // Windows or UNIX
        String wordFileName = filePath.substring(filePath.lastIndexOf(slashType) + 1, filePath.length());
        File htmlFile = new File(tempDirectory + wordFileName + ".htm");

        if(!htmlFile.exists()){
            try
            {
                Document doc = DocToHtmlConverter.process( new File(filePath) );

//                FileWriter out = new FileWriter(htmlFile);
                Writer out = new OutputStreamWriter(new FileOutputStream(htmlFile), "UTF-8");
                DOMSource domSource = new DOMSource( doc );
                StreamResult streamResult = new StreamResult( out );

                TransformerFactory tf = TransformerFactory.newInstance();
                Transformer serializer = tf.newTransformer();
                // TODO set encoding from a command argument
                serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
                serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
                serializer.setOutputProperty( OutputKeys.METHOD, "html" );
                serializer.transform( domSource, streamResult );

                out.flush();
                out.close();
            }
            catch ( Exception e )
            {
                e.printStackTrace();
            }
        }

        return htmlFile;
    }

    static Document process( File docFile ) throws Exception
    {
        final HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc( docFile );
        DocToHtmlConverter wordToHtmlConverter = new DocToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument() );
        wordToHtmlConverter.processDocument( wordDocument );
        return wordToHtmlConverter.getDocument();
    }

    private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();

    private final HtmlDocumentFacade htmlDocumentFacade;

    private Element notes = null;


    public DocToHtmlConverter( Document document )
    {
        this.htmlDocumentFacade = new HtmlDocumentFacade( document );
    }

    public DocToHtmlConverter( HtmlDocumentFacade htmlDocumentFacade )
    {
        this.htmlDocumentFacade = htmlDocumentFacade;
    }

    @Override
    protected void afterProcess()
    {
        if ( notes != null )
            htmlDocumentFacade.getBody().appendChild( notes );

        htmlDocumentFacade.updateStylesheet();
    }

    public Document getDocument()
    {
        return htmlDocumentFacade.getDocument();
    }

    @Override
    protected void outputCharacters( Element pElement,
            CharacterRun characterRun, String text )
    {
        Element span = htmlDocumentFacade.getDocument().createElement( "span" );
        pElement.appendChild( span );

        StringBuilder style = new StringBuilder();
        BlockProperies blockProperies = this.blocksProperies.peek();
        Triplet triplet = getCharacterRunTriplet( characterRun );

        if (WordToHtmlUtils.isNotEmpty( triplet.fontName )
                && !WordToHtmlUtils.equals( triplet.fontName,
                        blockProperies.pFontName ) )
        {
            style.append( "font-family:" + triplet.fontName + ";" );
        }
        if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
        {
            style.append( "font-size:" + characterRun.getFontSize() / 2 + "pt;" );
        }
        if ( triplet.bold )
        {
            style.append( "font-weight:bold;" );
        }
        if ( triplet.italic )
        {
            style.append( "font-style:italic;" );
        }

        WordToHtmlUtils.addCharactersProperties( characterRun, style );
        if ( style.length() != 0 )
            htmlDocumentFacade.addStyleClass( span, "s", style.toString() );

        Text textNode = htmlDocumentFacade.createText( text );
        span.appendChild( textNode );
    }

    @Override
    protected void processBookmarks( HWPFDocumentCore wordDocument,
            Element currentBlock, Range range, int currentTableLevel,
            List<Bookmark> rangeBookmarks )
    {
        Element parent = currentBlock;
        for ( Bookmark bookmark : rangeBookmarks )
        {
            Element bookmarkElement = htmlDocumentFacade
                    .createBookmark( bookmark.getName() );
            parent.appendChild( bookmarkElement );
            parent = bookmarkElement;
        }

        if ( range != null )
            processCharacters( wordDocument, currentTableLevel, range, parent );
    }

    @Override
    protected void processDocumentInformation(
            SummaryInformation summaryInformation )
    {
        if (WordToHtmlUtils.isNotEmpty( summaryInformation.getTitle() ) )
            htmlDocumentFacade.setTitle( summaryInformation.getTitle() );

        if (WordToHtmlUtils.isNotEmpty( summaryInformation.getAuthor() ) )
            htmlDocumentFacade.addAuthor( summaryInformation.getAuthor() );

        if (WordToHtmlUtils.isNotEmpty( summaryInformation.getKeywords() ) )
            htmlDocumentFacade.addKeywords( summaryInformation.getKeywords() );

        if (WordToHtmlUtils.isNotEmpty( summaryInformation.getComments() ) )
            htmlDocumentFacade
                    .addDescription( summaryInformation.getComments() );
    }

    @Override
    public void processDocumentPart( HWPFDocumentCore wordDocument, Range range )
    {
        super.processDocumentPart( wordDocument, range );
        afterProcess();
    }

    @Override
    protected void processDrawnObject( HWPFDocument doc,
            CharacterRun characterRun, OfficeDrawing officeDrawing,
            String path, Element block )
    {
        Element img = htmlDocumentFacade.createImage( path );
        block.appendChild( img );
    }

    @Override
    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
            int noteIndex, Element block, Range endnoteTextRange )
    {
        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
                endnoteTextRange );
    }

    @Override
    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
            int noteIndex, Element block, Range footnoteTextRange )
    {
        processNoteAutonumbered( wordDocument, "foot", noteIndex, block,
                footnoteTextRange );
    }

    @Override
    protected void processHyperlink( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String hyperlink )
    {
        Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink );
        currentBlock.appendChild( basicLink );

        if ( textRange != null )
            processCharacters( wordDocument, currentTableLevel, textRange,
                    basicLink );
    }

    protected void processImage( Element currentBlock, boolean inlined,
            Picture picture, String imageSourcePath )
    {
        final int aspectRatioX = picture.getHorizontalScalingFactor();
        final int aspectRatioY = picture.getVerticalScalingFactor();

        StringBuilder style = new StringBuilder();

        final float imageWidth;
        final float imageHeight;

        final float cropTop;
        final float cropBottom;
        final float cropLeft;
        final float cropRight;

        if ( aspectRatioX > 0 )
        {
            imageWidth = picture.getDxaGoal() * aspectRatioX / 1000
                    / TWIPS_PER_INCH;
            cropRight = picture.getDxaCropRight() * aspectRatioX / 1000
                    / TWIPS_PER_INCH;
            cropLeft = picture.getDxaCropLeft() * aspectRatioX / 1000
                    / TWIPS_PER_INCH;
        }
        else
        {
            imageWidth = picture.getDxaGoal() / TWIPS_PER_INCH;
            cropRight = picture.getDxaCropRight() / TWIPS_PER_INCH;
            cropLeft = picture.getDxaCropLeft() / TWIPS_PER_INCH;
        }

        if ( aspectRatioY > 0 )
        {
            imageHeight = picture.getDyaGoal() * aspectRatioY / 1000
                    / TWIPS_PER_INCH;
            cropTop = picture.getDyaCropTop() * aspectRatioY / 1000
                    / TWIPS_PER_INCH;
            cropBottom = picture.getDyaCropBottom() * aspectRatioY / 1000
                    / TWIPS_PER_INCH;
        }
        else
        {
            imageHeight = picture.getDyaGoal() / TWIPS_PER_INCH;
            cropTop = picture.getDyaCropTop() / TWIPS_PER_INCH;
            cropBottom = picture.getDyaCropBottom() / TWIPS_PER_INCH;
        }

        Element root;
        if ( cropTop != 0 || cropRight != 0 || cropBottom != 0 || cropLeft != 0 )
        {
            float visibleWidth = Math
                    .max( 0, imageWidth - cropLeft - cropRight );
            float visibleHeight = Math.max( 0, imageHeight - cropTop
                    - cropBottom );

            root = htmlDocumentFacade.createBlock();
            htmlDocumentFacade.addStyleClass( root, "d",
                    "vertical-align:text-bottom;width:" + visibleWidth
                            + "in;height:" + visibleHeight + "in;" );

            // complex
            Element inner = htmlDocumentFacade.createBlock();
            htmlDocumentFacade.addStyleClass( inner, "d",
                    "position:relative;width:" + visibleWidth + "in;height:"
                            + visibleHeight + "in;overflow:hidden;" );
            root.appendChild( inner );

            Element image = htmlDocumentFacade.createImage( imageSourcePath );
            htmlDocumentFacade.addStyleClass( image, "i",
                    "position:absolute;left:-" + cropLeft + ";top:-" + cropTop
                            + ";width:" + imageWidth + "in;height:"
                            + imageHeight + "in;" );
            inner.appendChild( image );

            style.append( "overflow:hidden;" );
        }
        else
        {
            root = htmlDocumentFacade.createImage( imageSourcePath );
            root.setAttribute( "style", "width:" + imageWidth + "in;height:"
                    + imageHeight + "in;vertical-align:text-bottom;" );
        }

        currentBlock.appendChild( root );
    }

    @Override
    protected void processImageWithoutPicturesManager( Element currentBlock,
            boolean inlined, Picture picture )
    {
        // no default implementation -- skip
        currentBlock.appendChild( htmlDocumentFacade.getDocument()
                .createComment( "Image link to '"
                        + picture.suggestFullFileName() + "' can be here" ) );
    }

    @Override
    protected void processLineBreak( Element block, CharacterRun characterRun )
    {
        block.appendChild( htmlDocumentFacade.createLineBreak() );
    }

    protected void processNoteAutonumbered( HWPFDocument doc, String type,
            int noteIndex, Element block, Range noteTextRange )
    {
        final String textIndex = String.valueOf( noteIndex + 1 );
        final String textIndexClass = htmlDocumentFacade.getOrCreateCssClass(
                "a", "vertical-align:super;font-size:smaller;" );
        final String forwardNoteLink = type + "note_" + textIndex;
        final String backwardNoteLink = type + "note_back_" + textIndex;

        Element anchor = htmlDocumentFacade.createHyperlink( "#"
                + forwardNoteLink );
        anchor.setAttribute( "name", backwardNoteLink );
        anchor.setAttribute( "class", textIndexClass + " " + type
                + "noteanchor" );
        anchor.setTextContent( textIndex );
        block.appendChild( anchor );

        if ( notes == null )
        {
            notes = htmlDocumentFacade.createBlock();
            notes.setAttribute( "class", "notes" );
        }

        Element note = htmlDocumentFacade.createBlock();
        note.setAttribute( "class", type + "note" );
        notes.appendChild( note );

        Element bookmark = htmlDocumentFacade.createBookmark( forwardNoteLink );
        bookmark.setAttribute( "href", "#" + backwardNoteLink );
        bookmark.setTextContent( textIndex );
        bookmark.setAttribute( "class", textIndexClass + " " + type
                + "noteindex" );
        note.appendChild( bookmark );
        note.appendChild( htmlDocumentFacade.createText( " " ) );

        Element span = htmlDocumentFacade.getDocument().createElement( "span" );
        span.setAttribute( "class", type + "notetext" );
        note.appendChild( span );

        this.blocksProperies.add( new BlockProperies( "", -1 ) );
        try
        {
            processCharacters( doc, Integer.MIN_VALUE, noteTextRange, span );
        }
        finally
        {
            this.blocksProperies.pop();
        }
    }

    @Override
    protected void processPageBreak( HWPFDocumentCore wordDocument, Element flow )
    {
        flow.appendChild( htmlDocumentFacade.createLineBreak() );
    }

    protected void processPageref( HWPFDocumentCore hwpfDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String pageref )
    {
        Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref );
        currentBlock.appendChild( basicLink );

        if ( textRange != null )
            processCharacters( hwpfDocument, currentTableLevel, textRange,
                    basicLink );
    }

    protected void processParagraph( HWPFDocumentCore hwpfDocument,
            Element parentElement, int currentTableLevel, Paragraph paragraph,
            String bulletText )
    {
        final Element pElement = htmlDocumentFacade.createParagraph();
        parentElement.appendChild( pElement );

        StringBuilder style = new StringBuilder();
        WordToHtmlUtils.addParagraphProperties( paragraph, style );

        final int charRuns = paragraph.numCharacterRuns();

        if ( charRuns == 0 )
        {
            return;
        }

        {
            final String pFontName;
            final int pFontSize;
            final CharacterRun characterRun = paragraph.getCharacterRun( 0 );
            if ( characterRun != null )
            {
                Triplet triplet = getCharacterRunTriplet( characterRun );
                pFontSize = characterRun.getFontSize() / 2;
                pFontName = triplet.fontName;
                WordToHtmlUtils.addFontFamily( pFontName, style );
                WordToHtmlUtils.addFontSize( pFontSize, style );
            }
            else
            {
                pFontSize = -1;
                pFontName = "";
            }
            blocksProperies.push( new BlockProperies( pFontName, pFontSize ) );
        }
        try
        {
            if (WordToHtmlUtils.isNotEmpty( bulletText ) )
            {
                if ( bulletText.endsWith( "\t" ) )
                {
                    /*
                     * We don't know how to handle all cases in HTML, but at
                     * least simplest case shall be handled
                     */
                    final float defaultTab = TWIPS_PER_INCH / 2;
                    float firstLinePosition = paragraph.getIndentFromLeft()
                            + paragraph.getFirstLineIndent() + 20; // char have
                                                                   // some space

                    float nextStop = (float) ( Math.ceil( firstLinePosition
                            / defaultTab ) * defaultTab );

                    final float spanMinWidth = nextStop - firstLinePosition;

                    Element span = htmlDocumentFacade.getDocument()
                            .createElement( "span" );
                    htmlDocumentFacade
                            .addStyleClass( span, "s",
                                    "display: inline-block; text-indent: 0; min-width: "
                                            + ( spanMinWidth / TWIPS_PER_INCH )
                                            + "in;" );
                    pElement.appendChild( span );

                    Text textNode = htmlDocumentFacade.createText( bulletText
                            .substring( 0, bulletText.length() - 1 )
                            + UNICODECHAR_ZERO_WIDTH_SPACE
                            + UNICODECHAR_NO_BREAK_SPACE );
                    span.appendChild( textNode );
                }
                else
                {
                    Text textNode = htmlDocumentFacade.createText( bulletText
                            .substring( 0, bulletText.length() - 1 ) );
                    pElement.appendChild( textNode );
                }
            }

            processCharacters( hwpfDocument, currentTableLevel, paragraph,
                    pElement );
        }
        finally
        {
            blocksProperies.pop();
        }

        if ( style.length() > 0 )
            htmlDocumentFacade.addStyleClass( pElement, "p", style.toString() );

        WordToHtmlUtils.compactSpans( pElement );
        return;
    }

    protected void processSection( HWPFDocumentCore wordDocument,
            Section section, int sectionCounter )
    {
        Element div = htmlDocumentFacade.createBlock();
        htmlDocumentFacade.addStyleClass( div, "d", getSectionStyle( section ) );
        htmlDocumentFacade.getBody().appendChild( div );

        processParagraphes( wordDocument, div, section, Integer.MIN_VALUE );
    }

    @Override
    protected void processSingleSection( HWPFDocumentCore wordDocument,
            Section section )
    {
        htmlDocumentFacade.addStyleClass( htmlDocumentFacade.getBody(), "b",
                getSectionStyle( section ) );

        processParagraphes( wordDocument, htmlDocumentFacade.getBody(), section,
                Integer.MIN_VALUE );
    }

}

这是开始转换文件的方法:

public File convertToFile(){
        String slashType = (filePath.lastIndexOf("\\") > 0) ? "\\" : "/"; // Windows or UNIX
        String wordFileName = filePath.substring(filePath.lastIndexOf(slashType) + 1, filePath.length());
        File htmlFile = new File(tempDirectory + wordFileName + ".htm");

        if(!htmlFile.exists()){
            try
            {
                Document doc = DocToHtmlConverter.process( new File(filePath) );

//                FileWriter out = new FileWriter(htmlFile);
                Writer out = new OutputStreamWriter(new FileOutputStream(htmlFile), "UTF-8");
                DOMSource domSource = new DOMSource( doc );
                StreamResult streamResult = new StreamResult( out );

                TransformerFactory tf = TransformerFactory.newInstance();
                Transformer serializer = tf.newTransformer();
                // TODO set encoding from a command argument
                serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
                serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
                serializer.setOutputProperty( OutputKeys.METHOD, "html" );
                serializer.transform( domSource, streamResult );

                out.flush();
                out.close();
            }
            catch ( Exception e )
            {
                e.printStackTrace();
            }
        }

        return htmlFile;
    }
4

0 回答 0