我在使用 PDF Clown 时遇到这样一个需求:如果某个关键字是另一个更长关键字的子字符串(即与另一个关键字部分重合),那么高亮时应当以完整的长关键字为准,短关键字的高亮必须被长关键字的高亮覆盖。例如,在下面的 Map 中,关键字 ETS 同时是 Just. ETS 和 Test. ETS 这两个关键字的子字符串;预期结果是高亮完整的关键字 Just. ETS 和 Test. ETS(并显示它们各自的弹出提示值),而不是只高亮其中的 ETS 关键字及其弹出提示值。附:实际输入的 PDF(ActualPdf)、实际结果 PDF,以及 jar 的路径。
Map<String, String> map = new HashMap<String, String>();
map.put("ETS" , "Loss");
map.put("Just. ETS" , "Net ");
map.put("Test. ETS" , "Profit");
(注意:1. 如果文件中较长的关键字已经被高亮,那么包含在该长关键字之内的较短关键字不应再被高亮。2. 如果较短的关键字已经被高亮,而它又包含在某个较长的关键字之内,那么应当高亮较长的关键字,并忽略/取消较短关键字的高亮。)
import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;
public class pdfclown2 {
private static int count;
public static void main(String[] args) throws IOException {
highlight("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf","C:\\Users\\\Downloads\\6.pdf");
System.out.println("OK");
}
private static void highlight(String inputPath, String outputPath) throws IOException {
org.pdfclown.files.File file = null;
try {
file = new org.pdfclown.files.File("C:\\Users\\uc239646\\Desktop\\test.pdf");
List<Keyword> l=new ArrayList<Keyword>();
Keyword k=new Keyword();
Keyword k1=new Keyword();
k1.setKey("Just. ETS");
k1.setValue("NET");
l.add(k1);
Keyword k2=new Keyword();
k2.setKey("Test. ETS");
k2.setValue("PROFIT");
l.add(k2);
k.setKey("ETS");
k.setValue("LOSS");
l.add(k);
long startTime = System.currentTimeMillis();
// 2. Iterating through the document pages...
TextExtractor textExtractor = new TextExtractor(true, true);
for (final Page page : file.getDocument().getPages()) {
Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
for (Keyword e : l) {
Pattern pattern;
String serachKey = e.getKey();
final String translationKeyword = e.getValue();
if ((serachKey.contains(")") && serachKey.contains("("))
|| (serachKey.contains("(") && !serachKey.contains(")"))
|| (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
|| serachKey.contains("*") || serachKey.contains("+")) {
pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
}
else
pattern = Pattern.compile("\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);
// 2.1. Extract the page text!
//System.out.println(textStrings.toString().indexOf(entry.getKey()));
// 2.2. Find the text pattern matches!
final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
// 2.3. Highlight the text pattern matches!
//System.out.println(textStrings);
textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
public boolean hasNext() {
// if(key.getMatchCriteria() == 1){
if (matcher.find()) {
return true;
}
/*
* } else if(key.getMatchCriteria() == 2) { if
*
*
*
*
*
*
*
*
* (matcher.hitEnd()) { count++; return true; } }
*/
return false;
}
public Interval<Integer> next() {
return new Interval<Integer>(matcher.start(), matcher.end());
}
public void process(Interval<Integer> interval, ITextString match) {
System.out.println(match);
// Defining the highlight box of the text pattern
// match...
/*List l=new ArrayList();
if(!l.contains(match)){
System.out.println("map.put("+match+","+translationKeyword+")");
}
*/
List<Quad> highlightQuads = new ArrayList<Quad>();
{
Rectangle2D textBox = null;
for (TextChar textChar : match.getTextChars()) {
Rectangle2D textCharBox = textChar.getBox();
if (textBox == null) {
textBox = (Rectangle2D) textCharBox.clone();
} else {
if (textCharBox.getY() > textBox.getMaxY()) {
highlightQuads.add(Quad.get(textBox));
textBox = (Rectangle2D) textCharBox.clone();
} else {
textBox.add(textCharBox);
}
}
System.out.println(highlightQuads.contains(textBox));
textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
highlightQuads.add(Quad.get(textBox));
}
/* List<Quad> highlightQuads = new ArrayList<Quad>();
List<TextChar> textChars = match.getTextChars();
Rectangle2D firstRect = textChars.get(0).getBox();
Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
Rectangle2D rect = firstRect.createUnion(lastRect);
highlightQuads.add(Quad.get(rect));*/
// subtype can be Highlight, Underline, StrikeOut, Squiggly
new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);
}
}
public void remove() {
throw new UnsupportedOperationException();
}
});
}
}
SerializationModeEnum serializationMode = SerializationModeEnum.Standard;
file.save(new java.io.File(outputPath), serializationMode);
System.out.println("file created");
long endTime = System.currentTimeMillis();
System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);
} catch (Exception e) {
e.printStackTrace();
}
}
}