我刚刚发现 Apache POI 库对于使用 Java 编辑 Word 文件非常有用。具体来说,我想使用 Apache POI 的 XWPF 类编辑DOCX文件。我发现没有合适的方法/文档可以做到这一点。有人可以逐步解释如何替换 DOCX 文件中的某些文本。
** 文本可以在一行/段落或表格行/列中
提前致谢 :)
我刚刚发现 Apache POI 库对于使用 Java 编辑 Word 文件非常有用。具体来说,我想使用 Apache POI 的 XWPF 类编辑DOCX文件。我发现没有合适的方法/文档可以做到这一点。有人可以逐步解释如何替换 DOCX 文件中的某些文本。
** 文本可以在一行/段落或表格行/列中
提前致谢 :)
您需要的方法是XWPFRun.setText(String)。只需遍历文件,直到找到感兴趣的 XWPFRun,找出您想要的新文本,然后替换它。(运行是具有相同格式的文本序列)
您应该能够执行以下操作:
// Minimal run-level search/replace over a .docx file.
// Only runs whose first text element contains the whole search string are changed;
// a search string that Word has split across several runs is not found by this approach.
XWPFDocument doc = new XWPFDocument(OPCPackage.open("input.docx"));
try {
    // Paragraphs that sit directly in the document body.
    for (XWPFParagraph p : doc.getParagraphs()) {
        List<XWPFRun> runs = p.getRuns();
        if (runs != null) {
            for (XWPFRun r : runs) {
                String text = r.getText(0);
                if (text != null && text.contains("needle")) {
                    text = text.replace("needle", "haystack");
                    // Position 0 overwrites the existing text instead of appending to it.
                    r.setText(text, 0);
                }
            }
        }
    }
    // Paragraphs inside table cells are not returned by doc.getParagraphs(), so walk them separately.
    for (XWPFTable tbl : doc.getTables()) {
        for (XWPFTableRow row : tbl.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                for (XWPFParagraph p : cell.getParagraphs()) {
                    for (XWPFRun r : p.getRuns()) {
                        String text = r.getText(0);
                        if (text != null && text.contains("needle")) {
                            text = text.replace("needle", "haystack");
                            r.setText(text, 0);
                        }
                    }
                }
            }
        }
    }
    // Close the output stream even when writing fails.
    try (FileOutputStream out = new FileOutputStream("output.docx")) {
        doc.write(out);
    }
} finally {
    // Release input.docx without saving the in-memory changes back into it.
    doc.getPackage().revert();
}
这是我们使用 Apache POI 进行文本替换的做法。我们发现,逐个运行(run)地替换文本得不偿失,直接替换整个 XWPFParagraph 的文本反而更简单。运行可能在单词中间被随意拆分,因为文档段落中运行的边界由 Microsoft Word 决定。因此,您要搜索的文本可能一半在一个运行中,另一半在另一个运行中。取段落的全文,删除其现有的运行,再用调整后的文本添加新的运行,似乎可以解决文本替换的问题。
但是,在段落级别进行替换是有成本的;您会丢失该段落中运行的格式。例如,如果您在段落的中间加粗了单词“bits”,然后在解析文件时将单词“bits”替换为“bytes”,则单词“bytes”将不再加粗。因为当段落的整个正文被替换时,粗体与一个被删除的运行一起存储。附加的代码有一个注释掉的部分,如果需要,它可以在运行级别替换文本。
还应注意,即使您插入的文本包含 \n 换行符,下面的代码也同样有效。我们没有找到其他插入换行的方法,只能为换行符之前的每一段文本各创建一个运行,并对该运行调用 addCarriageReturn()。祝好!
package com.healthpartners.hcss.client.external.word.replacement;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
public class TextReplacer {
private String searchValue;
private String replacement;
/**
 * Creates a replacer for one fixed search/replacement pair.
 *
 * @param searchValue the text to look for in each paragraph
 * @param replacement the text substituted for every occurrence of {@code searchValue}
 */
public TextReplacer(String searchValue, String replacement) {
    this.replacement = replacement;
    this.searchValue = searchValue;
}
/**
 * Applies the replacement to every paragraph returned by {@code document.getParagraphs()}.
 *
 * @param document the document whose paragraphs are rewritten in place
 */
public void replace(XWPFDocument document) {
    for (XWPFParagraph bodyParagraph : document.getParagraphs()) {
        replace(bodyParagraph);
    }
}
/**
 * Rewrites one paragraph if its text contains the search value: all existing runs are
 * removed and new runs holding the replaced text are inserted instead.
 *
 * @param paragraph the paragraph to inspect and, if it matches, rebuild
 */
private void replace(XWPFParagraph paragraph) {
    String originalText = paragraph.getText();
    if (!hasReplaceableItem(originalText)) {
        return; // nothing to replace, leave the runs untouched
    }
    String replacedText = StringUtils.replace(originalText, searchValue, replacement);
    removeAllRuns(paragraph);
    insertReplacementRuns(paragraph, replacedText);
}
/**
 * Inserts the given text into the paragraph as one run per line, starting at run index 0.
 * Every line except the last is followed by a carriage return on its run.
 *
 * <p>The text is split on {@code "\n"} with empty pieces preserved, so consecutive or
 * trailing newlines still produce (empty) runs with their carriage returns. Blank lines
 * in the text are therefore kept rather than collapsed.
 *
 * @param paragraph    the paragraph that receives the new runs
 * @param replacedText the full paragraph text after replacement; {@code null} inserts nothing
 */
private void insertReplacementRuns(XWPFParagraph paragraph, String replacedText) {
    if (replacedText == null) {
        return;
    }
    // limit -1 keeps empty pieces so that blank lines are not lost
    String[] lines = replacedText.split("\n", -1);
    for (int j = 0; j < lines.length; j++) {
        XWPFRun newRun = paragraph.insertNewRun(j);
        newRun.setText(lines[j]);
        boolean moreLinesFollow = j + 1 < lines.length;
        if (moreLinesFollow) {
            newRun.addCarriageReturn();
        }
    }
}
/**
 * Removes runs from the front of the paragraph, once for every run it held on entry.
 *
 * @param paragraph the paragraph to empty
 */
private void removeAllRuns(XWPFParagraph paragraph) {
    // The count is taken once up front; index 0 is removed each time because the list shifts.
    for (int remaining = paragraph.getRuns().size(); remaining > 0; remaining--) {
        paragraph.removeRun(0);
    }
}
/**
 * Tells whether the given text contains the search value.
 *
 * @param runText the text to inspect; may be {@code null}
 * @return {@code true} if both the text and the search value are non-null and the text
 *         contains the search value
 */
private boolean hasReplaceableItem(String runText) {
    if (runText == null || searchValue == null) {
        return false;
    }
    return runText.indexOf(searchValue) >= 0;
}
//REVISIT The below can be removed if Michele tests and approved the above less versatile replacement version
// private void replace(XWPFParagraph paragraph) {
// for (int i = 0; i < paragraph.getRuns().size() ; i++) {
// i = replace(paragraph, i);
// }
// }
// private int replace(XWPFParagraph paragraph, int i) {
// XWPFRun run = paragraph.getRuns().get(i);
//
// String runText = run.getText(0);
//
// if (hasReplaceableItem(runText)) {
// return replace(paragraph, i, run);
// }
//
// return i;
// }
// private int replace(XWPFParagraph paragraph, int i, XWPFRun run) {
// String runText = run.getCTR().getTArray(0).getStringValue();
//
// String beforeSuperLong = StringUtils.substring(runText, 0, runText.indexOf(searchValue));
//
// String[] replacementTextSplitOnCarriageReturn = StringUtils.split(replacement, "\n");
//
// String afterSuperLong = StringUtils.substring(runText, runText.indexOf(searchValue) + searchValue.length());
//
// Counter counter = new Counter(i);
//
// insertNewRun(paragraph, run, counter, beforeSuperLong);
//
// for (int j = 0; j < replacementTextSplitOnCarriageReturn.length; j++) {
// String part = replacementTextSplitOnCarriageReturn[j];
//
// XWPFRun newRun = insertNewRun(paragraph, run, counter, part);
//
// if (j+1 < replacementTextSplitOnCarriageReturn.length) {
// newRun.addCarriageReturn();
// }
// }
//
// insertNewRun(paragraph, run, counter, afterSuperLong);
//
// paragraph.removeRun(counter.getCount());
//
// return counter.getCount();
// }
// private class Counter {
// private int i;
//
// public Counter(int i) {
// this.i = i;
// }
//
// public void increment() {
// i++;
// }
//
// public int getCount() {
// return i;
// }
// }
// private XWPFRun insertNewRun(XWPFParagraph xwpfParagraph, XWPFRun run, Counter counter, String newText) {
// XWPFRun newRun = xwpfParagraph.insertNewRun(counter.i);
// newRun.getCTR().set(run.getCTR());
// newRun.getCTR().getTArray(0).setStringValue(newText);
//
// counter.increment();
//
// return newRun;
// }
我的任务是用 Map 中的值替换 Word docx 文档中格式为 ${key} 的文本。上述解决方案是一个很好的起点,但并未考虑所有情况:${key} 不仅可能分布在多个运行中,还可能分布在同一运行内的多个文本元素中。因此,我最终得到了以下代码:
/**
 * Opens the .docx at {@code inFile}, substitutes {@code ${key}} placeholders in body
 * paragraphs and in table cells, and writes the result to {@code out}.
 *
 * <p>The package opened on {@code inFile} is reverted when the method finishes. This
 * releases the file without saving the in-memory edits back into it.
 *
 * @param inFile path of the template document
 * @param data   placeholder keys mapped to their replacement values
 * @param out    stream that receives the resulting document; it is not closed here
 * @throws Exception   if the package cannot be opened or parsed
 * @throws IOException if the document cannot be written
 */
private void replace(String inFile, Map<String, String> data, OutputStream out) throws Exception, IOException {
    OPCPackage pkg = OPCPackage.open(inFile);
    try {
        XWPFDocument doc = new XWPFDocument(pkg);
        // Paragraphs directly in the document body.
        for (XWPFParagraph p : doc.getParagraphs()) {
            replace2(p, data);
        }
        // Paragraphs inside table cells are walked separately.
        for (XWPFTable tbl : doc.getTables()) {
            for (XWPFTableRow row : tbl.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    for (XWPFParagraph p : cell.getParagraphs()) {
                        replace2(p, data);
                    }
                }
            }
        }
        doc.write(out);
    } finally {
        // Close the input package WITHOUT saving, so the template file stays unchanged.
        pkg.revert();
    }
}
/**
 * Replaces every {@code ${key}} placeholder in one paragraph with the value mapped to
 * {@code key} in {@code data} (an empty string when the key has no mapping).
 *
 * <p>A placeholder may be spread over several runs and over several text elements of one
 * run. For each match, the first {@code $} found in the affected runs is replaced by the
 * value, and the text from <code>{</code> to <code>}</code> is blanked out.
 *
 * <p>The value is inserted literally: {@code $} and {@code \} inside it are not
 * interpreted as regex group references.
 *
 * <p>NOTE(review): match offsets and the position-to-run map are computed once from the
 * text as it was before any edit, so later matches in the same paragraph are looked up
 * with pre-edit offsets.
 *
 * @param p    the paragraph to rewrite in place
 * @param data placeholder keys mapped to replacement values
 */
private void replace2(XWPFParagraph p, Map<String, String> data) {
    String pText = p.getText(); // complete paragraph as string
    if (pText.contains("${")) { // if paragraph does not include our pattern, ignore
        TreeMap<Integer, XWPFRun> posRuns = getPosToRuns(p);
        Pattern pat = Pattern.compile("\\$\\{(.+?)\\}");
        Matcher m = pat.matcher(pText);
        while (m.find()) { // for all patterns in the paragraph
            String key = m.group(1); // text between ${ and }
            int s = m.start(1);
            int e = m.end(1);
            String x = data.get(key);
            if (x == null) {
                x = "";
            }
            // The value goes through replaceFirst below, where '$' and '\' are special.
            String literalValue = Matcher.quoteReplacement(x);
            // runs which contain the pattern: s - 2 is the '$', e is the '}'
            SortedMap<Integer, XWPFRun> range = posRuns.subMap(s - 2, true, e + 1, true);
            boolean found1 = false; // found $
            boolean found2 = false; // found {
            boolean found3 = false; // found }
            XWPFRun prevRun = null; // previous run handled in the loop
            XWPFRun found2Run = null; // run in which { was found
            int found2Pos = -1; // pos of { within above run
            for (XWPFRun r : range.values()) {
                if (r == prevRun) {
                    continue; // this run has already been handled
                }
                if (found3) {
                    break; // done working on current key pattern
                }
                prevRun = r;
                for (int k = 0;; k++) { // iterate over texts of run r
                    if (found3) {
                        break;
                    }
                    String txt = null;
                    try {
                        txt = r.getText(k);
                    } catch (Exception ignored) {
                        // Per the original author, getText(k) throws rather than returning
                        // null when text element k does not exist; treat that as "no more
                        // texts" and fall through to the null check below.
                    }
                    if (txt == null) {
                        break; // no more texts in the run, exit loop
                    }
                    if (txt.contains("$") && !found1) { // found $, replace it with value from data map
                        txt = txt.replaceFirst("\\$", literalValue);
                        found1 = true;
                    }
                    if (txt.contains("{") && !found2 && found1) {
                        found2Run = r; // found { replace it with empty string and remember location
                        found2Pos = txt.indexOf('{');
                        txt = txt.replaceFirst("\\{", "");
                        found2 = true;
                    }
                    if (found1 && found2 && !found3) { // find } and set all chars between { and } to blank
                        if (txt.contains("}")) {
                            if (r == found2Run) {
                                // complete pattern was within a single run
                                txt = txt.substring(0, found2Pos) + txt.substring(txt.indexOf('}'));
                            } else {
                                // pattern spread across multiple runs
                                txt = txt.substring(txt.indexOf('}'));
                            }
                        } else if (r == found2Run) {
                            // same run as { but no }, remove all text starting at {
                            txt = txt.substring(0, found2Pos);
                        } else {
                            txt = ""; // run between { and }, set text to blank
                        }
                    }
                    if (txt.contains("}") && !found3) {
                        txt = txt.replaceFirst("\\}", "");
                        found3 = true;
                    }
                    r.setText(txt, k);
                }
            }
        }
        // Debug output kept from the original: prints the paragraph text after replacement.
        System.out.println(p.getText());
    }
}
/**
 * Builds a sorted map from character offset to the run that supplies that character.
 * Offsets count through the concatenation of {@code run.text()} over the paragraph's
 * runs, in order.
 *
 * @param paragraph the paragraph whose runs are indexed
 * @return a map with one entry per character; runs with null or empty text add no entries
 */
private TreeMap<Integer, XWPFRun> getPosToRuns(XWPFParagraph paragraph) {
    TreeMap<Integer, XWPFRun> offsetToRun = new TreeMap<Integer, XWPFRun>();
    int offset = 0;
    for (XWPFRun currentRun : paragraph.getRuns()) {
        String content = currentRun.text();
        if (content == null || content.length() == 0) {
            continue; // contributes no characters
        }
        int end = offset + content.length();
        while (offset < end) {
            offsetToRun.put(offset, currentRun);
            offset++;
        }
    }
    return offsetToRun;
}
如果有人还需要保留文本的格式,则此代码效果更好。
/**
 * Maps every character offset of the paragraph to the run that supplies that character.
 * Offsets count through the concatenation of {@code run.text()} over the paragraph's
 * runs, in order.
 *
 * @param paragraph the paragraph whose runs are indexed
 * @return a map with one entry per character; runs whose text is null add no entries
 */
private static Map<Integer, XWPFRun> getPosToRuns(XWPFParagraph paragraph) {
    Map<Integer, XWPFRun> offsetToRun = new HashMap<Integer, XWPFRun>(10);
    int offset = 0;
    for (XWPFRun currentRun : paragraph.getRuns()) {
        String content = currentRun.text();
        if (content == null) {
            continue;
        }
        int end = offset + content.length();
        while (offset < end) {
            offsetToRun.put(offset, currentRun);
            offset++;
        }
    }
    return offsetToRun;
}
/**
 * Applies every search-text/replacement pair of {@code map} to each paragraph returned
 * by {@code document.getParagraphs()}.
 *
 * @param document the document to rewrite in place
 * @param map      search texts mapped to their replacements
 * @param <V>      replacement type
 */
public static <V> void replace(XWPFDocument document, Map<String, V> map) {
    for (XWPFParagraph bodyParagraph : document.getParagraphs()) {
        replace(bodyParagraph, map);
    }
}
/**
 * Replaces {@code searchText} with {@code replacement} in each paragraph returned by
 * {@code document.getParagraphs()}.
 *
 * @param document    the document to rewrite in place
 * @param searchText  the text to look for
 * @param replacement the value substituted for the search text
 * @param <V>         replacement type
 */
public static <V> void replace(XWPFDocument document, String searchText, V replacement) {
    for (XWPFParagraph bodyParagraph : document.getParagraphs()) {
        replace(bodyParagraph, searchText, replacement);
    }
}
/**
 * Applies every search-text/replacement pair of {@code map} to one paragraph, in the
 * map's iteration order.
 *
 * @param paragraph the paragraph to rewrite in place
 * @param map       search texts mapped to their replacements
 * @param <V>       replacement type
 */
private static <V> void replace(XWPFParagraph paragraph, Map<String, V> map) {
    for (Map.Entry<String, V> pair : map.entrySet()) {
        String searchText = pair.getKey();
        V replacement = pair.getValue();
        replace(paragraph, searchText, replacement);
    }
}
/**
 * Replaces every occurrence of {@code searchText} in the paragraph with the string form
 * of {@code replacement}, keeping the formatting of the run in which the match starts.
 *
 * <p>A match may span several runs. The replacement is written into the first run of the
 * match; each further line of a multi-line replacement gets a new run preceded by a
 * carriage return. Runs covered only by the match are removed. Text that precedes the
 * match in its first run and text that follows it in its last run is preserved. The
 * trailing text is appended to the last run written, so it takes that run's formatting.
 *
 * <p>Searching resumes after the inserted replacement, so a replacement that itself
 * contains {@code searchText} is not replaced again.
 *
 * <p>NOTE(review): offsets assume {@code paragraph.getText()} equals the concatenation
 * of {@code run.text()} over {@code paragraph.getRuns()}, and that an affected run keeps
 * its text in a single text element (only element 0 is rewritten). If the paragraph text
 * after an edit is not what that model predicts, the method stops rather than continuing
 * with unreliable offsets.
 *
 * @param paragraph   the paragraph to rewrite in place
 * @param searchText  the text to look for; {@code null} or empty does nothing
 * @param replacement the value whose {@code toString()} is inserted; {@code null} inserts
 *                    an empty string
 * @param <V>         replacement type
 */
public static <V> void replace(XWPFParagraph paragraph, String searchText, V replacement) {
    if (searchText == null || searchText.isEmpty()) {
        return; // indexOf("") matches at every position and would never terminate
    }
    String replacementText = replacement == null ? "" : replacement.toString();
    // limit -1 keeps empty pieces, so "\n" alone yields two pieces instead of none
    String[] texts = replacementText.split("\n", -1);
    int lastPiece = texts.length - 1;
    int searchFrom = 0;
    while (true) {
        String before = paragraph.getText();
        int pos = before.indexOf(searchText, searchFrom);
        if (pos < 0) {
            return;
        }
        int lastPos = pos + searchText.length() - 1;
        Map<Integer, XWPFRun> posToRuns = getPosToRuns(paragraph);
        XWPFRun run = posToRuns.get(pos);
        XWPFRun lastRun = posToRuns.get(lastPos);
        if (run == null || lastRun == null) {
            return; // paragraph text does not line up with the runs
        }
        // Text of the first run before the match, and of the last run after it.
        String prefix = run.text().substring(0, pos - firstOffsetOfRunAt(posToRuns, pos));
        String suffix = lastRun.text().substring(lastPos - firstOffsetOfRunAt(posToRuns, lastPos) + 1);

        int runNum = paragraph.getRuns().indexOf(run);
        int lastRunNum = paragraph.getRuns().indexOf(lastRun);
        run.setText(lastPiece == 0 ? prefix + texts[0] + suffix : prefix + texts[0], 0);
        XWPFRun newRun = run;
        for (int i = 1; i <= lastPiece; i++) {
            newRun.addCarriageReturn();
            newRun = paragraph.insertNewRun(runNum + i);
            newRun.setText(i == lastPiece ? texts[i] + suffix : texts[i]);
            copyBasicFormatting(run, newRun);
        }
        // Remove the remaining runs of the match; their indexes shifted by the inserted runs.
        for (int i = lastRunNum + lastPiece; i > runNum + lastPiece; i--) {
            paragraph.removeRun(i);
        }

        String expected = before.substring(0, pos) + replacementText
                + before.substring(pos + searchText.length());
        if (!expected.equals(paragraph.getText())) {
            return; // the edit did not land as modelled; do not loop on stale offsets
        }
        searchFrom = pos + replacementText.length();
    }
}

/** Returns the offset of the first character mapped to the same run as offset {@code pos}. */
private static int firstOffsetOfRunAt(Map<Integer, XWPFRun> posToRuns, int pos) {
    XWPFRun owner = posToRuns.get(pos);
    int start = pos;
    while (start > 0 && posToRuns.get(start - 1) == owner) {
        start--;
    }
    return start;
}

/**
 * Copies the simple character attributes from one run to another. Background colour and
 * other attributes are not copied; character spacing was left out by the original author.
 */
private static void copyBasicFormatting(XWPFRun from, XWPFRun to) {
    to.setBold(from.isBold());
    to.setCapitalized(from.isCapitalized());
    if (from.getColor() != null) { // unset colour: leave the new run's colour unset as well
        to.setColor(from.getColor());
    }
    to.setDoubleStrikethrough(from.isDoubleStrikeThrough());
    to.setEmbossed(from.isEmbossed());
    if (from.getFontFamily() != null) { // unset font: leave the new run's font unset as well
        to.setFontFamily(from.getFontFamily());
    }
    // presumably getFontSize() reports "not set" as -1 -- confirm for the POI version in use
    if (from.getFontSize() > 0) {
        to.setFontSize(from.getFontSize());
    }
    to.setImprinted(from.isImprinted());
    to.setItalic(from.isItalic());
    to.setKerning(from.getKerning());
    to.setShadow(from.isShadowed());
    to.setSmallCaps(from.isSmallCaps());
    to.setStrikeThrough(from.isStrikeThrough());
    to.setSubscript(from.getSubscript());
    to.setUnderline(from.getUnderline());
}
下面是 replaceParagraph 的实现:它将 ${key} 替换为 value(来自参数 fieldsForReport),并通过合并包含 ${key} 的 runs 的内容来保留格式。
private void replaceParagraph(XWPFParagraph paragraph, Map<String, String> fieldsForReport) throws POIXMLException {
String find, text, runsText;
List<XWPFRun> runs;
XWPFRun run, nextRun;
for (String key : fieldsForReport.keySet()) {
text = paragraph.getText();
if (!text.contains("${"))
return;
find = "${" + key + "}";
if (!text.contains(find))
continue;
runs = paragraph.getRuns();
for (int i = 0; i < runs.size(); i++) {
run = runs.get(i);
runsText = run.getText(0);
if (runsText.contains("${") || (runsText.contains("$") && runs.get(i + 1).getText(0).substring(0, 1).equals("{"))) {
//As the next run may has a closed tag and an open tag at
//the same time, we have to be sure that our building string
//has a fully completed tags
while (!openTagCountIsEqualCloseTagCount(runsText))) {
nextRun = runs.get(i + 1);
runsText = runsText + nextRun.getText(0);
paragraph.removeRun(i + 1);
}
run.setText(runsText.contains(find) ?
runsText.replace(find, fieldsForReport.get(key)) :
runsText, 0);
}
}
}
}
/**
 * Checks whether the text contains as many {@code "${"} sequences as <code>"}"</code>
 * characters.
 *
 * @param runText the text to inspect; must not be {@code null}
 * @return {@code true} when both counts are equal (including when both are zero)
 */
private boolean openTagCountIsEqualCloseTagCount(String runText) {
    int opened = 0;
    for (int at = runText.indexOf("${"); at >= 0; at = runText.indexOf("${", at + 2)) {
        opened++;
    }
    int closed = 0;
    for (int at = runText.indexOf('}'); at >= 0; at = runText.indexOf('}', at + 1)) {
        closed++;
    }
    return opened == closed;
}
第一段代码给了我一个 NullPointerException,有人知道出了什么问题吗?
run.getText(int position) - 来自文档:返回:此文本运行的文本,如果未设置,则返回 null
在调用 contains() 之前检查它是否不为空
顺便说一句,如果要替换文本,则需要将其设置在从中获取它的位置,在本例中为 r.setText(text, 0);。否则文本将被添加而不是替换
截至撰写本文之日,没有一个答案可以正确替换。
Gagravarr 的答案没有考虑要替换的单词被拆分到多个运行中的情况;Thierry Boduin 的解决方案有时会在替换某个单词之后把其他单词替换为空白,而且它也不检查表格。
我以 Gagravarr 的答案为基础,添加了一个 else 分支:还会检查当前运行之前的那个运行,处理要替换的单词被拆分在这两个运行的文本中的情况。我用 Kotlin 写的补充如下:
// Fragment from the body of a loop over the runs of paragraph `p`; `i` is the index of
// the current run `r` and `text` is its first text element (the loop itself is not shown).
if (text != null) {
if (text.contains(findText)) {
// The whole search text sits inside this run: replace it and overwrite text element 0.
text = text.replace(findText, replaceText)
r.setText(text, 0)
} else if (i > 0 && p.runs[i - 1].getText(0).plus(text).contains(findText)) {
// The search text is split between the previous run and this one.
// NOTE(review): looking for '$' assumes findText starts with '$' -- confirm with the caller.
val pos = p.runs[i - 1].getText(0).indexOf('$')
// Strip from this run the tail part of findText that it holds.
text = textOfNotFullSecondRun(text, findText)
r.setText(text, 0)
// Length of the leading part of findText that is present in the previous run.
val findTextLengthInFirstRun = findTextPartInFirstRun(p.runs[i - 1].getText(0), findText)
// NOTE(review): replaceRange takes (startIndex, endIndex); a length is passed as the end index here -- verify.
val prevRunText = p.runs[i - 1].getText(0).replaceRange(pos, findTextLengthInFirstRun, replaceText)
p.runs[i - 1].setText(prevRunText, 0)
}
}
/**
 * Removes from [text] the longest tail of [findText] that [text] contains.
 * Leading characters of [findText] are dropped one at a time until the remainder occurs
 * in [text]; every occurrence of that remainder is then removed.
 */
private fun textOfNotFullSecondRun(text: String, findText: String): String {
    var tail = findText
    while (!text.contains(tail)) {
        tail = tail.drop(1)
    }
    return text.replace(tail, "")
}
/**
 * Returns the length of the longest head of [findText] that [text] contains.
 * Trailing characters of [findText] are dropped one at a time until the remainder occurs
 * in [text]; the result is 0 when not even the first character occurs.
 */
private fun findTextPartInFirstRun(text: String, findText: String): Int {
    var head = findText
    while (!text.contains(head)) {
        head = head.dropLast(1)
    }
    return head.length
}
它是段落中的运行列表。与表中的搜索块相同。有了这个解决方案,我还没有任何问题。所有格式都完好无损。
编辑:我制作了一个用于替换的 Java 库,请查看:https://github.com/deividasstr/docx-word-replacer
此处被接受的答案除了 Justin Skiles 提到的更新之外,还需要再做一处更新:r.setText(text, 0);。原因:如果调用 setText 时不传入 pos 参数,输出将是旧字符串与替换字符串的拼接。
我建议我在 # 之间替换文本的解决方案,例如:This #bookmark# should be Replace。 它被替换为:
此外,它还考虑了符号 # 和书签在单独运行中的情况(在不同运行之间替换变量)。
代码链接:https://gist.github.com/aerobium/bf02e443c079c5caec7568e167849dda
基于 Dmitry Stolbov 在这里的回答以及它遇到的问题和限制以及我在下面的类中提供的其余响应,它实现了在段落和表格中搜索的方法 generateDocument。
在这里,我解决了在回复中发现的几个问题,例如:
这很好用,但我需要一些关于如何解决我遇到的问题的见解。有时文件中要替换的值大于要替换的标签,这最终会搞砸对齐。例如:
发生的事情是 {#branch#} 和 {#insurCompanyCorporateName#} 被更大的字符串替换,在 {#branch#} 标记之后有几个“\t”元素,再加上 {#insurCompanyCorporateName# } 值也大于标签,将内容向前推使其拆分到下一行。
我想知道是否有人对我在运行时如何理解我要替换的值是否使文档分割线或弄乱页面中其他元素的位置有一些见解。在这种情况下,我希望我的程序理解他应该在分支之后删除一些“\t”。或者也许将 {#insurCompanyCorporateName#} 拆分为新行,但使新行开始低于原始标签或其他内容。
想法?
类:
package com.idoine.struts2.action.shared;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.*;
import org.json.JSONObject;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
/**
 * Generates Word documents from .docx templates by replacing key tags of the form
 * {@code {#keyTag#}} with values taken from a {@link JSONObject}.
 *
 * Created by migue on 11/11/2020.
 */
public class DocumentGeneratorAction {

    /**
     * Generates a document as a {@link ByteArrayInputStream} from the Word template at
     * {@code templatePath}. Every key tag in body paragraphs and table cells is replaced
     * by the corresponding value in {@code fields}. Key tags are expected in the form
     * {@code {#keyTag#}}.
     *
     * <p>Used as reference: https://stackoverflow.com/a/49765239/5936443 [at 11/11/2020]
     *
     * <p>The template package is reverted when the method finishes. This releases the
     * file without saving the in-memory edits back into the template.
     *
     * @param templatePath path of the .docx template
     * @param fields       key tags (without the {@code {#}/{@code #}} separators) and their values
     * @return the generated document, or {@code null} if the template could not be read
     *         or the document could not be written
     */
    public static ByteArrayInputStream generateDocument(String templatePath, JSONObject fields) {
        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(templatePath);
            XWPFDocument doc = new XWPFDocument(pkg);
            // search in paragraphs
            for (XWPFParagraph p : doc.getParagraphs()) {
                replaceFieldsParagraph(p, fields);
            }
            // search in tables
            for (XWPFTable t : doc.getTables()) {
                replaceFieldsTable(t, fields);
            }
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            doc.write(out);
            return new ByteArrayInputStream(out.toByteArray());
        } catch (IOException | InvalidFormatException e) {
            // Failure is reported to callers as a null result, as before.
            e.printStackTrace();
            return null;
        } finally {
            if (pkg != null) {
                // Close the template WITHOUT saving so the file on disk stays unchanged.
                pkg.revert();
            }
        }
    }

    /**
     * Replaces, in one paragraph, every occurrence of each key tag present in
     * {@code fields} by the corresponding value.
     *
     * @param paragraph the paragraph to rewrite in place
     * @param fields    key tags and their values
     */
    public static void replaceFieldsParagraph(XWPFParagraph paragraph, JSONObject fields) {
        String text = paragraph.getText(); // all the text from each run concatenated
        if (!text.contains("{#")) { // paragraph doesn't have keys to replace
            return;
        }
        // for each field to replace, search it in the current paragraph
        for (String key : fields.keySet()) {
            String findStr = "{#" + key + "#}";
            if (!text.contains(findStr)) {
                continue; // paragraph doesn't have the current key
            }
            mergeRunsWithSplittedKeyTags(paragraph);
            String value = String.valueOf(fields.get(key));
            for (XWPFRun run : paragraph.getRuns()) {
                checkAndReplaceFieldRun(run, findStr, value);
            }
        }
    }

    /**
     * Replaces, in every cell paragraph of one table, every occurrence of each key tag
     * present in {@code fields} by the corresponding value.
     *
     * @param table  the table to rewrite in place
     * @param fields key tags and their values
     */
    public static void replaceFieldsTable(XWPFTable table, JSONObject fields) {
        for (XWPFTableRow row : table.getRows()) { // iterate over rows
            for (XWPFTableCell cell : row.getTableCells()) { // iterate over columns
                List<XWPFParagraph> cellParagraphs = cell.getParagraphs();
                if (cellParagraphs == null) {
                    continue;
                }
                for (XWPFParagraph paragraph : cellParagraphs) {
                    replaceFieldsParagraph(paragraph, fields);
                }
            }
        }
    }

    /**
     * Replaces {@code findStr} by {@code value} in the first text element of the run, if
     * that text contains it.
     *
     * @param run     the run to inspect
     * @param findStr the complete key tag, including separators
     * @param value   the replacement text
     */
    public static void checkAndReplaceFieldRun(XWPFRun run, String findStr, String value) {
        String runText = run.getText(0);
        if (runText != null && runText.contains(findStr)) {
            // pos=0 overwrites the existing text; without it the text would be appended
            run.setText(runText.replace(findStr, value), 0);
        }
    }

    /**
     * Merges runs so that every key tag starting with "{#" and ending with "#}" sits in a
     * single run.
     *
     * <p>A run is a part of the paragraph that has the same formatting. Word splits the
     * text of a paragraph into runs in an almost 'random' way, so a key tag is sometimes
     * split across multiple runs.
     *
     * <p>When a run opens a key tag it does not close, the following runs are looked
     * ahead until the tag is complete. Only then is their text appended to the opening
     * run and the following runs removed. If the tag is never completed, the paragraph
     * is left untouched. Runs without text are skipped.
     *
     * @param paragraph the paragraph whose runs are merged in place
     */
    public static void mergeRunsWithSplittedKeyTags(XWPFParagraph paragraph) {
        List<XWPFRun> runs = paragraph.getRuns();
        for (int i = 0; i < runs.size(); i++) {
            XWPFRun run = runs.get(i);
            String runText = run.getText(0);
            if (runText == null) {
                continue;
            }
            boolean startsKeyTag = runText.contains("{#") // current run has the complete separator "{#"
                    // current run has the first char, next run has the second char
                    || (runText.contains("{") && runTextStartsWith(runs, i + 1, "#"));
            if (!startsKeyTag) {
                continue;
            }
            // Look ahead without modifying anything until the collected text is complete.
            String merged = runText;
            int lastMerged = i;
            while (!openTagMatchesCloseTag(merged) && lastMerged + 1 < runs.size()) {
                lastMerged++;
                String nextText = runs.get(lastMerged).getText(0);
                if (nextText != null) {
                    merged = merged + nextText;
                }
            }
            if (lastMerged > i && openTagMatchesCloseTag(merged)) {
                for (int j = lastMerged; j > i; j--) {
                    paragraph.removeRun(j);
                }
                // pos=0 replaces the contents; without it the text is added and chars repeat
                run.setText(merged, 0);
            }
        }
    }

    /** Tells whether the run at {@code index} exists and its first text starts with {@code prefix}. */
    private static boolean runTextStartsWith(List<XWPFRun> runs, int index, String prefix) {
        if (index >= runs.size()) {
            return false;
        }
        String runText = runs.get(index).getText(0);
        return runText != null && runText.startsWith(prefix);
    }

    /**
     * Validates whether the text is complete with respect to key tags. It is complete
     * when it has no key tags at all, or when every "{#" has a matching "#}".
     *
     * @param runText the text to inspect; must not be {@code null}
     * @return {@code false} if the text holds only part of a key tag, {@code true} otherwise
     */
    public static boolean openTagMatchesCloseTag(String runText) {
        int incompleteOpenTagCount = runText.split("\\{", -1).length - 1; // "{"
        int completeOpenTagCount = runText.split("\\{#", -1).length - 1; // "{#"
        int completeCloseTagCount = runText.split("#}", -1).length - 1; // "#}"
        if (completeOpenTagCount > 0) {
            // we already have open tags, compare the counts
            return completeOpenTagCount == completeCloseTagCount;
        }
        // a lone "{" without "#" is an open tag still being completed;
        // with neither "{" nor "{#" there is nothing to close
        return incompleteOpenTagCount == 0;
    }
}