我在我的 Java EE 项目(在 Apache Tomcat 7.0.32 上运行)中使用 Apache POI 库将 .doc 文件转换为 .html 文件。
我以 org.apache.poi.hwpf.converter 包中的 WordToHtmlConverter.java 类为基础。我唯一的改动是:输出文件的路径通过构造函数传入(而不是在 main() 方法中指定)。
当我运行应用程序并上传一个 .doc 文件(大小 < 500KB)时,我看到 java.exe 进程的内存占用增加了 10-15 MB。当我再上传另一个 .doc 文件时,java.exe 的内存占用又增加了 2-3 MB。也就是说,每上传一个 .doc 文件,java.exe 的内存占用都会增加 2-3 MB。如果上传较大的 .doc 文件(大小 > 3MB),java.exe 的内存占用会再增加 20-30 MB。如此持续下去,直到出现 java.lang.OutOfMemoryError: Java heap space。
为什么会这样?为什么每次上传 .doc 文件时 java.exe 进程会增加 2-3 MB?我应该怎么做才能避免出现java.lang.OutOfMemoryError?
这是我在我的应用程序中使用的 WordToHtmlConverter.java 的源代码:
package ru.emzior.view.convertors;
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.HtmlDocumentFacade;
import org.apache.poi.hwpf.converter.AbstractWordConverter;
import java.io.File;
import java.io.FileWriter;
import java.util.List;
import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.OfficeDrawing;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.Beta;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
/**
* Converts Word files (95-2007) into HTML files.
* <p>
* This implementation doesn't create images or links to them. This can be
* changed by overriding {@link #processImage(Element, boolean, Picture, String)}
* method.
*
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
@Beta
public class DocToHtmlConverter extends AbstractWordConverter implements HtmlConverter
{
// Path of the source Word document; assigned only by the (filePath, tempDirectory) constructor.
private String filePath;
// Prefix that convertToFile() concatenates directly in front of the generated HTML file name
// (no separator is inserted between the two).
private String tempDirectory;
/**
 * Immutable pair of font name and font size describing the block that is
 * currently being processed. Instances are kept on the converter's stack so
 * that character runs can be compared against the enclosing block.
 */
private static class BlockProperies
{
    final String pFontName;
    final int pFontSize;

    public BlockProperies( String fontName, int fontSize )
    {
        pFontName = fontName;
        pFontSize = fontSize;
    }
}
/**
 * Creates a converter bound to a source file path and an output location
 * prefix, for use with {@code convertToFile()}.
 * <p>
 * The HTML facade of this instance is left {@code null}; the conversion
 * itself is carried out by a separate converter instance created inside
 * the static {@code process(File)} method.
 *
 * @param filePath      path of the Word document to convert
 * @param tempDirectory prefix prepended to the generated HTML file name
 */
public DocToHtmlConverter(String filePath, String tempDirectory) {
    this.htmlDocumentFacade = null;
    this.tempDirectory = tempDirectory;
    this.filePath = filePath;
}
// Logger obtained from POI's log factory for this class; not referenced by the code visible here.
private static final POILogger logger = POILogFactory
.getLogger( DocToHtmlConverter.class );
/**
 * Builds the inline CSS for a section: a {@code margin} declaration in
 * inches (top, right, bottom, left) and, when the section has more than
 * one column, {@code column-count} and {@code column-gap} declarations.
 *
 * @param section the Word section whose margins and columns are read
 * @return the CSS declarations as a single string
 */
private static String getSectionStyle( Section section )
{
    final float top = section.getMarginTop() / TWIPS_PER_INCH;
    final float right = section.getMarginRight() / TWIPS_PER_INCH;
    final float bottom = section.getMarginBottom() / TWIPS_PER_INCH;
    final float left = section.getMarginLeft() / TWIPS_PER_INCH;

    final StringBuilder css = new StringBuilder();
    css.append( "margin: " ).append( top ).append( "in " )
            .append( right ).append( "in " )
            .append( bottom ).append( "in " )
            .append( left ).append( "in;" );

    final int columns = section.getNumColumns();
    if ( columns <= 1 )
    {
        return css.toString();
    }

    css.append( "column-count: " ).append( columns ).append( ";" );
    if ( !section.isColumnsEvenlySpaced() )
    {
        // Unevenly spaced columns get a fixed gap.
        css.append( "column-gap: 0.25in;" );
    }
    else
    {
        final float gap = section.getDistanceBetweenColumns() / TWIPS_PER_INCH;
        css.append( "column-gap: " ).append( gap ).append( "in;" );
    }
    return css.toString();
}
/**
 * Converts the Word document at {@code filePath} into an HTML file named
 * {@code tempDirectory + <source file name> + ".htm"} and returns that file.
 * <p>
 * If the target file already exists, no conversion is performed and the
 * existing file is returned as is.
 * <p>
 * Failures are printed to standard error and not propagated. On any failure
 * (including {@link Error}s such as {@code OutOfMemoryError}) the output
 * writer is still closed and the partially written file is deleted if
 * possible, so that a later call retries the conversion instead of
 * returning a truncated file.
 *
 * @return the HTML file; it may not exist if the conversion failed
 */
public File convertToFile(){
    // Accept both Windows and UNIX separators, including paths that mix them.
    int nameStart = Math.max( filePath.lastIndexOf( '\\' ), filePath.lastIndexOf( '/' ) ) + 1;
    String wordFileName = filePath.substring( nameStart );
    File htmlFile = new File( tempDirectory + wordFileName + ".htm" );
    if ( !htmlFile.exists() )
    {
        boolean converted = false;
        try
        {
            Document doc = DocToHtmlConverter.process( new File( filePath ) );
            Writer out = new OutputStreamWriter( new FileOutputStream( htmlFile ), "UTF-8" );
            try
            {
                DOMSource domSource = new DOMSource( doc );
                StreamResult streamResult = new StreamResult( out );
                TransformerFactory tf = TransformerFactory.newInstance();
                Transformer serializer = tf.newTransformer();
                // TODO set encoding from a command argument
                serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
                serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
                serializer.setOutputProperty( OutputKeys.METHOD, "html" );
                serializer.transform( domSource, streamResult );
                out.flush();
            }
            finally
            {
                // Always release the file handle, even when serialization fails.
                out.close();
            }
            converted = true;
        }
        catch ( Exception e )
        {
            // Kept as best-effort reporting: callers do not expect an exception here.
            e.printStackTrace();
        }
        finally
        {
            // Do not leave a truncated file behind; the exists() check above
            // would otherwise hand it out on every subsequent call.
            if ( !converted && htmlFile.exists() )
            {
                htmlFile.delete();
            }
        }
    }
    return htmlFile;
}
/**
 * Loads the given Word file, runs it through a fresh converter backed by
 * a newly created empty DOM document, and returns the resulting document.
 *
 * @param docFile the Word file to convert
 * @return the DOM document produced by the converter
 * @throws Exception if loading or converting the file fails
 */
static Document process( File docFile ) throws Exception
{
    final HWPFDocumentCore source = WordToHtmlUtils.loadDoc( docFile );

    final Document target = DocumentBuilderFactory.newInstance()
            .newDocumentBuilder().newDocument();
    final DocToHtmlConverter converter = new DocToHtmlConverter( target );

    converter.processDocument( source );
    return converter.getDocument();
}
// Font properties of the blocks currently being processed; entries are pushed before and
// popped after character processing so outputCharacters() can peek at the enclosing block.
private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
// Facade used to create nodes of the output HTML document; null for instances created
// through the (filePath, tempDirectory) constructor.
private final HtmlDocumentFacade htmlDocumentFacade;
// Container for footnote/endnote blocks; created lazily by processNoteAutonumbered()
// and appended to the body in afterProcess().
private Element notes = null;
/**
 * Creates a converter that writes into the given DOM document by wrapping
 * it in a new {@link HtmlDocumentFacade}.
 *
 * @param document the DOM document that receives the generated HTML
 */
public DocToHtmlConverter( Document document )
{
    this( new HtmlDocumentFacade( document ) );
}
/**
 * Creates a converter that writes through the given HTML document facade.
 *
 * @param htmlDocumentFacade the facade used to build the output document
 */
public DocToHtmlConverter( HtmlDocumentFacade htmlDocumentFacade )
{
this.htmlDocumentFacade = htmlDocumentFacade;
}
/**
 * Finishes the output document: appends the collected notes block to the
 * body, if any notes were produced, and then updates the stylesheet.
 */
@Override
protected void afterProcess()
{
    final Element collectedNotes = notes;
    if ( collectedNotes != null )
    {
        htmlDocumentFacade.getBody().appendChild( collectedNotes );
    }

    htmlDocumentFacade.updateStylesheet();
}
/**
 * Returns the DOM document held by the HTML facade.
 *
 * @return the output document
 */
public Document getDocument()
{
    final Document output = htmlDocumentFacade.getDocument();
    return output;
}
/**
 * Emits a {@code span} containing the given text under {@code pElement}.
 * <p>
 * Font family and font size are only written when they differ from the
 * properties of the enclosing block (top of the block stack); bold, italic
 * and the remaining character properties are added as well. The collected
 * CSS, if any, is attached to the span as a style class with prefix "s".
 *
 * @param pElement     parent element that receives the span
 * @param characterRun the character run supplying the formatting
 * @param text         the text content of the span
 */
@Override
protected void outputCharacters( Element pElement,
        CharacterRun characterRun, String text )
{
    final Element span = htmlDocumentFacade.getDocument().createElement( "span" );
    pElement.appendChild( span );

    final StringBuilder css = new StringBuilder();
    final BlockProperies enclosing = this.blocksProperies.peek();
    final Triplet font = getCharacterRunTriplet( characterRun );

    final boolean fontDiffers = WordToHtmlUtils.isNotEmpty( font.fontName )
            && !WordToHtmlUtils.equals( font.fontName, enclosing.pFontName );
    if ( fontDiffers )
    {
        css.append( "font-family:" + font.fontName + ";" );
    }

    // The size is halved before comparing with, and writing next to, the block's size.
    final int halvedSize = characterRun.getFontSize() / 2;
    if ( halvedSize != enclosing.pFontSize )
    {
        css.append( "font-size:" + halvedSize + "pt;" );
    }

    if ( font.bold )
    {
        css.append( "font-weight:bold;" );
    }
    if ( font.italic )
    {
        css.append( "font-style:italic;" );
    }
    WordToHtmlUtils.addCharactersProperties( characterRun, css );

    if ( css.length() != 0 )
    {
        htmlDocumentFacade.addStyleClass( span, "s", css.toString() );
    }

    final Text content = htmlDocumentFacade.createText( text );
    span.appendChild( content );
}
/**
 * Creates one bookmark element per bookmark, each nested inside the
 * previous one starting from {@code currentBlock}, and then processes the
 * characters of {@code range} (when it is not {@code null}) inside the
 * innermost element.
 *
 * @param wordDocument      the document being converted
 * @param currentBlock      element that receives the outermost bookmark
 * @param range             text range covered by the bookmarks; may be {@code null}
 * @param currentTableLevel table nesting level passed on to character processing
 * @param rangeBookmarks    bookmarks to emit, outermost first
 */
@Override
protected void processBookmarks( HWPFDocumentCore wordDocument,
        Element currentBlock, Range range, int currentTableLevel,
        List<Bookmark> rangeBookmarks )
{
    Element innermost = currentBlock;
    for ( Bookmark mark : rangeBookmarks )
    {
        final Element anchor = htmlDocumentFacade.createBookmark( mark.getName() );
        innermost.appendChild( anchor );
        innermost = anchor;
    }

    if ( range == null )
    {
        return;
    }
    processCharacters( wordDocument, currentTableLevel, range, innermost );
}
/**
 * Copies the non-empty title, author, keywords and comments of the summary
 * information into the HTML document (comments become the description).
 *
 * @param summaryInformation the Word document's summary information
 */
@Override
protected void processDocumentInformation(
        SummaryInformation summaryInformation )
{
    final String title = summaryInformation.getTitle();
    if ( WordToHtmlUtils.isNotEmpty( title ) )
    {
        htmlDocumentFacade.setTitle( title );
    }

    final String author = summaryInformation.getAuthor();
    if ( WordToHtmlUtils.isNotEmpty( author ) )
    {
        htmlDocumentFacade.addAuthor( author );
    }

    final String keywords = summaryInformation.getKeywords();
    if ( WordToHtmlUtils.isNotEmpty( keywords ) )
    {
        htmlDocumentFacade.addKeywords( keywords );
    }

    final String comments = summaryInformation.getComments();
    if ( WordToHtmlUtils.isNotEmpty( comments ) )
    {
        htmlDocumentFacade.addDescription( comments );
    }
}
/**
 * Processes the given range of the document through the superclass
 * implementation and then runs {@link #afterProcess()} to finish the output.
 *
 * @param wordDocument the document being converted
 * @param range        the part of the document to process
 */
@Override
public void processDocumentPart( HWPFDocumentCore wordDocument, Range range )
{
super.processDocumentPart( wordDocument, range );
// Must run after the superclass call: it appends the notes collected during processing.
afterProcess();
}
/**
 * Appends an image element pointing at {@code path} to {@code block}.
 * The document, character run and drawing arguments are not used here.
 *
 * @param doc           the document being converted
 * @param characterRun  the character run that contains the drawing
 * @param officeDrawing the drawing object
 * @param path          source path used for the image element
 * @param block         element that receives the image
 */
@Override
protected void processDrawnObject( HWPFDocument doc,
        CharacterRun characterRun, OfficeDrawing officeDrawing,
        String path, Element block )
{
    block.appendChild( htmlDocumentFacade.createImage( path ) );
}
/**
 * Emits an auto-numbered endnote by delegating to
 * {@code processNoteAutonumbered} with the note type "end".
 *
 * @param wordDocument     the document being converted
 * @param noteIndex        zero-based index of the note
 * @param block            element that receives the note anchor
 * @param endnoteTextRange range holding the endnote text
 */
@Override
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
        int noteIndex, Element block, Range endnoteTextRange )
{
    final String noteType = "end";
    processNoteAutonumbered( wordDocument, noteType, noteIndex, block,
            endnoteTextRange );
}
/**
 * Emits an auto-numbered footnote by delegating to
 * {@code processNoteAutonumbered} with the note type "foot".
 *
 * @param wordDocument      the document being converted
 * @param noteIndex         zero-based index of the note
 * @param block             element that receives the note anchor
 * @param footnoteTextRange range holding the footnote text
 */
@Override
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
        int noteIndex, Element block, Range footnoteTextRange )
{
    final String noteType = "foot";
    processNoteAutonumbered( wordDocument, noteType, noteIndex, block,
            footnoteTextRange );
}
/**
 * Appends a hyperlink element for {@code hyperlink} to the current block
 * and, when a text range is given, processes its characters inside the link.
 *
 * @param wordDocument      the document being converted
 * @param currentBlock      element that receives the link
 * @param textRange         link text; may be {@code null}
 * @param currentTableLevel table nesting level passed on to character processing
 * @param hyperlink         the link target
 */
@Override
protected void processHyperlink( HWPFDocumentCore wordDocument,
        Element currentBlock, Range textRange, int currentTableLevel,
        String hyperlink )
{
    final Element link = htmlDocumentFacade.createHyperlink( hyperlink );
    currentBlock.appendChild( link );

    if ( textRange == null )
    {
        return;
    }
    processCharacters( wordDocument, currentTableLevel, textRange, link );
}
/**
 * Appends an image for {@code picture} to {@code currentBlock}.
 * <p>
 * Width, height and crop margins are computed in inches from the picture's
 * goal size and crop values, scaled by the horizontal/vertical scaling
 * factor when that factor is positive. An uncropped picture becomes a
 * single image element with an inline style. A cropped picture becomes an
 * outer block sized to the visible area, containing an
 * {@code overflow:hidden} inner block, containing the absolutely
 * positioned full-size image shifted by the left/top crop.
 *
 * @param currentBlock    element that receives the image
 * @param inlined         not used by this implementation
 * @param picture         the picture supplying size, scaling and crop data
 * @param imageSourcePath source path used for the image element
 */
protected void processImage( Element currentBlock, boolean inlined,
        Picture picture, String imageSourcePath )
{
    final int aspectRatioX = picture.getHorizontalScalingFactor();
    final int aspectRatioY = picture.getVerticalScalingFactor();

    final float imageWidth;
    final float imageHeight;
    final float cropTop;
    final float cropBottom;
    final float cropLeft;
    final float cropRight;

    if ( aspectRatioX > 0 )
    {
        imageWidth = picture.getDxaGoal() * aspectRatioX / 1000
                / TWIPS_PER_INCH;
        cropRight = picture.getDxaCropRight() * aspectRatioX / 1000
                / TWIPS_PER_INCH;
        cropLeft = picture.getDxaCropLeft() * aspectRatioX / 1000
                / TWIPS_PER_INCH;
    }
    else
    {
        imageWidth = picture.getDxaGoal() / TWIPS_PER_INCH;
        cropRight = picture.getDxaCropRight() / TWIPS_PER_INCH;
        cropLeft = picture.getDxaCropLeft() / TWIPS_PER_INCH;
    }

    if ( aspectRatioY > 0 )
    {
        imageHeight = picture.getDyaGoal() * aspectRatioY / 1000
                / TWIPS_PER_INCH;
        cropTop = picture.getDyaCropTop() * aspectRatioY / 1000
                / TWIPS_PER_INCH;
        cropBottom = picture.getDyaCropBottom() * aspectRatioY / 1000
                / TWIPS_PER_INCH;
    }
    else
    {
        imageHeight = picture.getDyaGoal() / TWIPS_PER_INCH;
        cropTop = picture.getDyaCropTop() / TWIPS_PER_INCH;
        cropBottom = picture.getDyaCropBottom() / TWIPS_PER_INCH;
    }

    Element root;
    if ( cropTop != 0 || cropRight != 0 || cropBottom != 0 || cropLeft != 0 )
    {
        float visibleWidth = Math
                .max( 0, imageWidth - cropLeft - cropRight );
        float visibleHeight = Math.max( 0, imageHeight - cropTop
                - cropBottom );

        root = htmlDocumentFacade.createBlock();
        htmlDocumentFacade.addStyleClass( root, "d",
                "vertical-align:text-bottom;width:" + visibleWidth
                        + "in;height:" + visibleHeight + "in;" );

        // Inner block clips the full-size image to the visible area.
        Element inner = htmlDocumentFacade.createBlock();
        htmlDocumentFacade.addStyleClass( inner, "d",
                "position:relative;width:" + visibleWidth + "in;height:"
                        + visibleHeight + "in;overflow:hidden;" );
        root.appendChild( inner );

        // CSS lengths need a unit: without "in" a non-zero left/top offset
        // is invalid and ignored, so the crop would not be applied.
        Element image = htmlDocumentFacade.createImage( imageSourcePath );
        htmlDocumentFacade.addStyleClass( image, "i",
                "position:absolute;left:-" + cropLeft + "in;top:-" + cropTop
                        + "in;width:" + imageWidth + "in;height:"
                        + imageHeight + "in;" );
        inner.appendChild( image );
    }
    else
    {
        root = htmlDocumentFacade.createImage( imageSourcePath );
        root.setAttribute( "style", "width:" + imageWidth + "in;height:"
                + imageHeight + "in;vertical-align:text-bottom;" );
    }

    currentBlock.appendChild( root );
}
/**
 * Emits no image; instead appends an HTML comment to the current block
 * that names the picture's suggested file name as a placeholder.
 *
 * @param currentBlock element that receives the comment
 * @param inlined      not used by this implementation
 * @param picture      the picture whose suggested file name is mentioned
 */
@Override
protected void processImageWithoutPicturesManager( Element currentBlock,
        boolean inlined, Picture picture )
{
    // no default implementation -- leave a placeholder comment
    final String placeholder = "Image link to '"
            + picture.suggestFullFileName() + "' can be here";
    currentBlock.appendChild(
            htmlDocumentFacade.getDocument().createComment( placeholder ) );
}
/**
 * Appends a line-break element to {@code block}.
 *
 * @param block        element that receives the line break
 * @param characterRun not used by this implementation
 */
@Override
protected void processLineBreak( Element block, CharacterRun characterRun )
{
    final Element lineBreak = htmlDocumentFacade.createLineBreak();
    block.appendChild( lineBreak );
}
/**
 * Emits an auto-numbered note of the given type ("end" or "foot" in the
 * callers visible here).
 * <p>
 * A superscript-styled anchor with the one-based note number is appended
 * to {@code block} and links to the note. The note itself (a back-linking
 * bookmark, a space and a span with the note text) is added to the shared
 * notes container, which is created on first use.
 *
 * @param doc           the document being converted
 * @param type          note type prefix used in link names and CSS classes
 * @param noteIndex     zero-based index of the note
 * @param block         element that receives the anchor
 * @param noteTextRange range holding the note text
 */
protected void processNoteAutonumbered( HWPFDocument doc, String type,
        int noteIndex, Element block, Range noteTextRange )
{
    final String number = String.valueOf( noteIndex + 1 );
    final String numberCssClass = htmlDocumentFacade.getOrCreateCssClass(
            "a", "vertical-align:super;font-size:smaller;" );
    final String toNote = type + "note_" + number;
    final String backToText = type + "note_back_" + number;

    // Anchor in the running text, pointing at the note.
    final Element anchor = htmlDocumentFacade.createHyperlink( "#" + toNote );
    anchor.setAttribute( "name", backToText );
    anchor.setAttribute( "class", numberCssClass + " " + type + "noteanchor" );
    anchor.setTextContent( number );
    block.appendChild( anchor );

    // Lazily create the container that collects all notes.
    if ( notes == null )
    {
        notes = htmlDocumentFacade.createBlock();
        notes.setAttribute( "class", "notes" );
    }

    final Element noteBlock = htmlDocumentFacade.createBlock();
    noteBlock.setAttribute( "class", type + "note" );
    notes.appendChild( noteBlock );

    // Numbered bookmark in the note, linking back to the anchor.
    final Element backLink = htmlDocumentFacade.createBookmark( toNote );
    backLink.setAttribute( "href", "#" + backToText );
    backLink.setTextContent( number );
    backLink.setAttribute( "class", numberCssClass + " " + type + "noteindex" );
    noteBlock.appendChild( backLink );
    noteBlock.appendChild( htmlDocumentFacade.createText( " " ) );

    final Element noteText = htmlDocumentFacade.getDocument().createElement( "span" );
    noteText.setAttribute( "class", type + "notetext" );
    noteBlock.appendChild( noteText );

    // Note text is rendered against an empty block context (no font name, size -1).
    this.blocksProperies.add( new BlockProperies( "", -1 ) );
    try
    {
        processCharacters( doc, Integer.MIN_VALUE, noteTextRange, noteText );
    }
    finally
    {
        this.blocksProperies.pop();
    }
}
/**
 * Represents a page break in the HTML output by appending a line-break
 * element to {@code flow}.
 *
 * @param wordDocument the document being converted (not used here)
 * @param flow         element that receives the line break
 */
@Override
protected void processPageBreak( HWPFDocumentCore wordDocument, Element flow )
{
    final Element lineBreak = htmlDocumentFacade.createLineBreak();
    flow.appendChild( lineBreak );
}
/**
 * Appends an in-document hyperlink to {@code "#" + pageref} to the current
 * block and, when a text range is given, processes its characters inside
 * the link.
 *
 * @param hwpfDocument      the document being converted
 * @param currentBlock      element that receives the link
 * @param textRange         link text; may be {@code null}
 * @param currentTableLevel table nesting level passed on to character processing
 * @param pageref           name of the referenced target
 */
protected void processPageref( HWPFDocumentCore hwpfDocument,
        Element currentBlock, Range textRange, int currentTableLevel,
        String pageref )
{
    final Element link = htmlDocumentFacade.createHyperlink( "#" + pageref );
    currentBlock.appendChild( link );

    if ( textRange == null )
    {
        return;
    }
    processCharacters( hwpfDocument, currentTableLevel, textRange, link );
}
/**
 * Converts one Word paragraph into an HTML paragraph element appended to
 * {@code parentElement}.
 * <p>
 * A paragraph without character runs yields an empty, unstyled element.
 * Otherwise the font of the first character run becomes the block's
 * reference font while the paragraph's characters are processed, the
 * optional bullet text is emitted first, and the collected CSS is attached
 * as a style class with prefix "p". Spans are compacted at the end.
 *
 * @param hwpfDocument      the document being converted
 * @param parentElement     element that receives the paragraph
 * @param currentTableLevel table nesting level passed on to character processing
 * @param paragraph         the paragraph to convert
 * @param bulletText        list bullet/number text; may be empty
 */
protected void processParagraph( HWPFDocumentCore hwpfDocument,
        Element parentElement, int currentTableLevel, Paragraph paragraph,
        String bulletText )
{
    final Element paragraphElement = htmlDocumentFacade.createParagraph();
    parentElement.appendChild( paragraphElement );

    final StringBuilder css = new StringBuilder();
    WordToHtmlUtils.addParagraphProperties( paragraph, css );

    if ( paragraph.numCharacterRuns() == 0 )
    {
        return;
    }

    // The first character run defines the reference font of the paragraph.
    final CharacterRun firstRun = paragraph.getCharacterRun( 0 );
    final String baseFontName;
    final int baseFontSize;
    if ( firstRun == null )
    {
        baseFontSize = -1;
        baseFontName = "";
    }
    else
    {
        final Triplet font = getCharacterRunTriplet( firstRun );
        baseFontSize = firstRun.getFontSize() / 2;
        baseFontName = font.fontName;
        WordToHtmlUtils.addFontFamily( baseFontName, css );
        WordToHtmlUtils.addFontSize( baseFontSize, css );
    }
    blocksProperies.push( new BlockProperies( baseFontName, baseFontSize ) );

    try
    {
        if ( WordToHtmlUtils.isNotEmpty( bulletText ) )
        {
            if ( !bulletText.endsWith( "\t" ) )
            {
                // NOTE(review): the last character is dropped even though it
                // is not a tab here - confirm this is intended.
                final Text bulletNode = htmlDocumentFacade.createText( bulletText
                        .substring( 0, bulletText.length() - 1 ) );
                paragraphElement.appendChild( bulletNode );
            }
            else
            {
                /*
                 * Not every case can be expressed in HTML; only the simplest
                 * one is handled: pad the bullet up to the next default tab
                 * stop.
                 */
                final float defaultTab = TWIPS_PER_INCH / 2;
                // the extra 20 leaves some room for the character itself
                final float firstLinePosition = paragraph.getIndentFromLeft()
                        + paragraph.getFirstLineIndent() + 20;
                final float nextStop = (float) ( Math.ceil( firstLinePosition
                        / defaultTab ) * defaultTab );
                final float spanMinWidth = nextStop - firstLinePosition;

                final Element bulletSpan = htmlDocumentFacade.getDocument()
                        .createElement( "span" );
                htmlDocumentFacade.addStyleClass( bulletSpan, "s",
                        "display: inline-block; text-indent: 0; min-width: "
                                + ( spanMinWidth / TWIPS_PER_INCH )
                                + "in;" );
                paragraphElement.appendChild( bulletSpan );

                final Text bulletNode = htmlDocumentFacade.createText( bulletText
                        .substring( 0, bulletText.length() - 1 )
                        + UNICODECHAR_ZERO_WIDTH_SPACE
                        + UNICODECHAR_NO_BREAK_SPACE );
                bulletSpan.appendChild( bulletNode );
            }
        }

        processCharacters( hwpfDocument, currentTableLevel, paragraph,
                paragraphElement );
    }
    finally
    {
        blocksProperies.pop();
    }

    if ( css.length() > 0 )
    {
        htmlDocumentFacade.addStyleClass( paragraphElement, "p", css.toString() );
    }
    WordToHtmlUtils.compactSpans( paragraphElement );
}
/**
 * Converts one section into a block element: the block gets the section's
 * style as a class with prefix "d", is appended to the body, and then
 * receives the section's paragraphs.
 *
 * @param wordDocument   the document being converted
 * @param section        the section to convert
 * @param sectionCounter not used by this implementation
 */
protected void processSection( HWPFDocumentCore wordDocument,
        Section section, int sectionCounter )
{
    final Element sectionBlock = htmlDocumentFacade.createBlock();
    final String sectionCss = getSectionStyle( section );
    htmlDocumentFacade.addStyleClass( sectionBlock, "d", sectionCss );
    htmlDocumentFacade.getBody().appendChild( sectionBlock );

    processParagraphes( wordDocument, sectionBlock, section, Integer.MIN_VALUE );
}
/**
 * Converts a document that consists of a single section: the section's
 * style is attached to the body as a class with prefix "b" and the
 * paragraphs are written directly into the body.
 *
 * @param wordDocument the document being converted
 * @param section      the only section of the document
 */
@Override
protected void processSingleSection( HWPFDocumentCore wordDocument,
        Section section )
{
    final String sectionCss = getSectionStyle( section );
    htmlDocumentFacade.addStyleClass( htmlDocumentFacade.getBody(), "b",
            sectionCss );

    processParagraphes( wordDocument, htmlDocumentFacade.getBody(), section,
            Integer.MIN_VALUE );
}
}
这是开始转换文件的方法:
/**
 * Converts the Word document at {@code filePath} into an HTML file named
 * {@code tempDirectory + <source file name> + ".htm"} and returns that file.
 * <p>
 * If the target file already exists, no conversion is performed and the
 * existing file is returned as is.
 * <p>
 * Failures are printed to standard error and not propagated. On any failure
 * (including {@link Error}s such as {@code OutOfMemoryError}) the output
 * writer is still closed and the partially written file is deleted if
 * possible, so that a later call retries the conversion instead of
 * returning a truncated file.
 *
 * @return the HTML file; it may not exist if the conversion failed
 */
public File convertToFile(){
    // Accept both Windows and UNIX separators, including paths that mix them.
    int nameStart = Math.max( filePath.lastIndexOf( '\\' ), filePath.lastIndexOf( '/' ) ) + 1;
    String wordFileName = filePath.substring( nameStart );
    File htmlFile = new File( tempDirectory + wordFileName + ".htm" );
    if ( !htmlFile.exists() )
    {
        boolean converted = false;
        try
        {
            Document doc = DocToHtmlConverter.process( new File( filePath ) );
            Writer out = new OutputStreamWriter( new FileOutputStream( htmlFile ), "UTF-8" );
            try
            {
                DOMSource domSource = new DOMSource( doc );
                StreamResult streamResult = new StreamResult( out );
                TransformerFactory tf = TransformerFactory.newInstance();
                Transformer serializer = tf.newTransformer();
                // TODO set encoding from a command argument
                serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
                serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
                serializer.setOutputProperty( OutputKeys.METHOD, "html" );
                serializer.transform( domSource, streamResult );
                out.flush();
            }
            finally
            {
                // Always release the file handle, even when serialization fails.
                out.close();
            }
            converted = true;
        }
        catch ( Exception e )
        {
            // Kept as best-effort reporting: callers do not expect an exception here.
            e.printStackTrace();
        }
        finally
        {
            // Do not leave a truncated file behind; the exists() check above
            // would otherwise hand it out on every subsequent call.
            if ( !converted && htmlFile.exists() )
            {
                htmlFile.delete();
            }
        }
    }
    return htmlFile;
}