使用 Highlighter(高亮器)比使用 explain(评分解释)更好,速度也更快。对文本做高亮处理之后,您可以提取高亮标签之间的匹配短语。
可参考:使用 Java 正则表达式提取标签之间的文本。
/**
 * Demonstrates the Lucene highlighter: a handful of short texts are indexed in memory, the index
 * is searched for the term {@code park}, and every scored fragment of every hit is checked for
 * highlight markup and printed.
 */
public class HighlightDemo {
    /** In-memory index, rebuilt by {@link #setUp()}. */
    Directory directory;

    /** Analyzer used for indexing, for query parsing and for re-tokenizing text to highlight. */
    Analyzer analyzer;

    /** Raw texts to index; the stored "id" field of a document is its position in this array. */
    String[] contents = {"running in the park",
            "I was jogging in the park this morning",
            "running on the road",
            "The famous New York Marathon has its final miles in Central park every year and it's easy to understand why: the park, with a variety of terrain and excellent scenery, is the ultimate runner's dream. With its many paths that range in level of difficulty, Central Park allows a runner to experience clarity and freedom in this picturesque urban oasis."};

    /**
     * Builds the in-memory index with one document per entry of {@link #contents}.
     *
     * @throws IOException if the index cannot be written
     */
    @Before
    public void setUp() throws IOException {
        directory = new RAMDirectory();
        analyzer = new WhitespaceAnalyzer();

        IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (int i = 0; i < contents.length; i++) {
                Document doc = new Document();
                // Indexed but NOT stored: the text is recovered from contents via the "id" field.
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                // Stored and indexed, so a search hit can be mapped back to its source text.
                doc.add(new NumericField("id", Field.Store.YES, true).setIntValue(i));
                writer.addDocument(doc);
            }
        } finally {
            // Close the writer even when adding a document fails.
            writer.close();
        }
    }

    /**
     * Searches for "park" and verifies that each fragment with a positive score contains the
     * opening and closing highlight tags.
     *
     * @throws IOException                  if the index cannot be read
     * @throws ParseException               if the query string cannot be parsed
     * @throws InvalidTokenOffsetsException if the highlighter rejects the token offsets
     */
    @Test
    public void test() throws IOException, ParseException, InvalidTokenOffsetsException {
        IndexSearcher s = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_36, "content", analyzer);
            org.apache.lucene.search.Query query = parser.parse("park");
            TopDocs hits = s.search(query, 10);

            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

            for (int i = 0; i < hits.scoreDocs.length; i++) {
                // Load the hit once and reuse it; "content" is not stored, so the original
                // text is looked up in contents through the stored id.
                Document doc = s.doc(hits.scoreDocs[i].doc);
                String text = contents[Integer.parseInt(doc.get("id"))];

                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
                org.apache.lucene.search.highlight.TextFragment[] frag =
                        highlighter.getBestTextFragments(tokenStream, text, false, 10);

                for (int j = 0; j < frag.length; j++) {
                    if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                        String fragment = frag[j].toString();
                        assertTrue(fragment.contains("<B>"));
                        assertTrue(fragment.contains("</B>"));
                        System.out.println(fragment);
                    }
                }
            }
        } finally {
            // The searcher was opened from the directory here, so it is closed here as well,
            // including when an assertion fails.
            s.close();
        }
    }
}