我有一些将 Word 文档转换为 HTML 的软件。我需要的是一种获取文档文本的方法。下面是我的代码示例。当文档转换为 html 时,文档似乎被写入 html 标签。我对 html 和 web 开发没有太多经验。我主要是应用程序和桌面开发人员。什么是获取段落文本的简单方法。以下是我到目前为止的代码。我需要将信息作为字符串获取,以便解析它。
private void convertWordDoctoHTML(File file) throws ParserConfigurationException, TransformerConfigurationException, TransformerException, IOException {
HWPFDocumentCore wordDocument = null;
try {
wordDocument = WordToHtmlUtils.loadDoc(new FileInputStream(file));
} catch (IOException ex) {
Exceptions.printStackTrace(ex);
}
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
wordToHtmlConverter.processDocument(wordDocument);
org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);
StringWriter stringWriter = new StringWriter();
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
out.close();
String result = new String(out.toByteArray());
System.out.println(result);
docEditorPane.setPage(selectedFile.toURI().toURL());
newDocText = result;
}