我继承了一个数据存储,它使用简单的文本文件来保存文档。
文档有一些属性(日期、标题和文本),它们被编码在一个文件名中:<date>-<title>.txt,文件的主体是文本。
然而,系统中的文档实际上还有更多属性,而且还有人提议继续添加新的属性。
切换到 XML 格式似乎是合乎逻辑的,我已经这样做了,现在每个文档都编码在它自己的 XML 文件中。
然而,从 XML 中读取文件现在非常慢!(以前 .txt 格式的 2000 篇文章需要几秒钟,现在 .xml 格式的 2000 篇文章需要 10 多分钟)。
我使用的是 DOM 解析器,在发现读取速度有多慢后,我切换到了 SAX 解析器,但它仍然很慢(嗯,更快,但仍然是 10 分钟)。
是 XML 本身就这么慢,还是我哪里做得不对?如有任何想法,不胜感激。
该系统是用 JavaSE 1.6 编写的。解析器是这样创建的:
/*
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
*/
// Parse one article from the input `is` with a SAX parser and return the
// Article assembled by ArticleSaxHandler.
// NOTE(review): the enclosing method header and the declaration of `is` are
// outside this excerpt — presumably `is` is the stream of one article file; confirm.
// NOTE(review): a new factory and a new parser are created on every call; if
// this runs once per file, reusing them may be worth measuring.
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser;
try {
saxParser = factory.newSAXParser();
// The handler collects the article fields while the parser walks the document.
ArticleSaxHandler handler = new ArticleSaxHandler();
saxParser.parse(is, handler);
return handler.getArticle();
} catch (ParserConfigurationException e) {
// Re-thrown as IOException, keeping the original exception as the cause.
throw new IOException(e);
} catch (SAXException e) {
// Re-thrown as IOException, keeping the original exception as the cause.
throw new IOException(e);
} finally {
// Always close the input; a failure to close is logged, not propagated.
if (is != null) {
try {
is.close();
} catch (IOException e) {
logger.error(e);
}
}
}
}
/**
 * SAX handler that collects the fields of one article from parse events.
 *
 * <p>Character data is accumulated in {@link #builder}; when an element ends,
 * the accumulated text is assigned to the field that corresponds to the
 * element most recently started. Call {@link #getArticle()} after parsing.
 *
 * <p>Whitespace-only character data that appears between elements (for
 * example the indentation of a pretty-printed file) is discarded instead of
 * being treated as unfinished element content.
 */
private class ArticleSaxHandler extends DefaultHandler {
    private URI uri = null;
    private String source = null;
    private String author = null;
    private DateTime articleDatetime = null;
    private DateTime processedDatetime = null;
    private String title = null;
    private String text = null;

    /** Element whose text is currently being collected, or null if none/unknown. */
    private ArticleElement currentElement;

    /** Character data received since the last element boundary. */
    private final StringBuilder builder = new StringBuilder();

    /**
     * Builds an Article from the values collected so far.
     *
     * @return a new Article; fields whose elements were not seen are null
     */
    public Article getArticle() {
        return new Article(uri, source, author, articleDatetime, processedDatetime, title, text);
    }

    /**
     * Receive notification of the start of an element.
     *
     * @throws RuntimeException wrapping a SAXParseException if non-whitespace
     *         text was collected for the previous element before this one started
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        if (hasNonWhitespace(builder)) {
            throw new RuntimeException(new SAXParseException(currentElement + " was not finished before " + qName + " was started", null));
        }
        // Anything left here is inter-element whitespace; drop it so it is not
        // prepended to the text of the element that is starting.
        builder.setLength(0);
        currentElement = ArticleElement.getElement(qName);
    }

    /**
     * Receive notification of the end of an element and store the collected
     * text in the field matching the current element.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        final String elementText = builder.toString();
        builder.setLength(0);
        if (currentElement == null) {
            // Unknown element, or the end tag of a container whose last child
            // already reset currentElement.
            return;
        }
        switch (currentElement) {
            case ARTICLE:
                break;
            case URI:
                try {
                    // The `uri` parameter shadows the field, hence `this.uri`.
                    this.uri = new URI(elementText);
                } catch (URISyntaxException e) {
                    throw new RuntimeException(e);
                }
                break;
            case SOURCE:
                source = elementText;
                break;
            case AUTHOR:
                author = elementText;
                break;
            case ARTICLE_DATE_TIME:
                articleDatetime = getDateTimeFormatter().parseDateTime(elementText);
                break;
            case PROCESSED_DATE_TIME:
                processedDatetime = getDateTimeFormatter().parseDateTime(elementText);
                break;
            case TITLE:
                title = elementText;
                break;
            case TEXT:
                this.text = elementText;
                break;
            default:
                throw new IllegalStateException("Unexpected ArticleElement: " + currentElement);
        }
        currentElement = null;
    }

    /** Receive notification of character data inside an element. */
    @Override
    public void characters(char[] ch, int start, int length) {
        builder.append(ch, start, length);
    }

    /** Logs a recoverable parse error; parsing is allowed to continue. */
    @Override
    public void error(SAXParseException e) {
        logParseProblem(e);
    }

    /**
     * Logs a fatal parse error and rethrows it so the failure is not silently
     * swallowed by this handler.
     *
     * @throws SAXException always; the exception passed in
     */
    @Override
    public void fatalError(SAXParseException e) throws SAXException {
        logParseProblem(e);
        throw e;
    }

    /** Logs the parse problem together with the handler state at that point. */
    private void logParseProblem(SAXParseException e) {
        logger.error("currentElement: " + currentElement + " ||builder: " + builder.toString() + "\n\n" + e.getMessage(), e);
    }

    /** Returns true if the sequence contains at least one non-whitespace character. */
    private boolean hasNonWhitespace(CharSequence chars) {
        for (int i = 0; i < chars.length(); i++) {
            if (!Character.isWhitespace(chars.charAt(i))) {
                return true;
            }
        }
        return false;
    }
}
/**
 * The XML elements that make up an article file, each tied to its element name.
 */
private enum ArticleElement {
    ARTICLE(ARTICLE_ELEMENT_NAME),
    URI(URI_ELEMENT_NAME),
    SOURCE(SOURCE_ELEMENT_NAME),
    AUTHOR(AUTHOR_ELEMENT_NAME),
    ARTICLE_DATE_TIME(ARTICLE_DATETIME_ELEMENT_NAME),
    PROCESSED_DATE_TIME(PROCESSED_DATETIME_ELEMENT_NAME),
    TITLE(TITLE_ELEMENT_NAME),
    TEXT(TEXT_ELEMENT_NAME);

    /**
     * Lookup table from element name to constant, built once. This replaces a
     * per-call values() copy plus linear scan. Fully qualified type names are
     * used so the block does not depend on extra imports.
     */
    private static final java.util.Map<String, ArticleElement> BY_NAME =
            new java.util.HashMap<String, ArticleElement>();

    static {
        for (ArticleElement element : values()) {
            // Keep the first constant for a given name, matching the order a
            // linear scan over values() would have found it in.
            if (!BY_NAME.containsKey(element.name)) {
                BY_NAME.put(element.name, element);
            }
        }
    }

    private final String name;

    private ArticleElement(String name) {
        this.name = name;
    }

    /**
     * Finds the constant for an XML element name.
     *
     * @param qName the qualified element name reported by the parser
     * @return the matching constant, or null if the name is not a known element
     */
    public static ArticleElement getElement(String qName) {
        return BY_NAME.get(qName);
    }
}